<a href="https://colab.research.google.com/github/InovationProject/llm_customization/blob/main/Phi_3_mini_128k_4k_instruct_learning_upload.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install peft
!pip install trl
!pip install flash_attn

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [2]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

"""
A simple example on using SFTTrainer and Accelerate to finetune Phi-3 models. For
a more advanced example, please follow HF alignment-handbook/scripts/run_sft.py.
This example has utilized DeepSpeed ZeRO3 offload to reduce the memory usage. The
script can be run on V100 or later generation GPUs. Here are some suggestions on
futher reducing memory consumption:
    - reduce batch size
    - decrease lora dimension
    - restrict lora target modules
Please follow these steps to run the script:
1. Install dependencies:
    conda install -c conda-forge accelerate
    pip3 install -i https://pypi.org/simple/ bitsandbytes
    pip3 install peft
    pip3 install deepspeed
2. Setup accelerate and deepspeed config based on the machine used:
    accelerate config
Here is a sample config for deepspeed zero3:
    compute_environment: LOCAL_MACHINE
    debug: false
    deepspeed_config:
    gradient_accumulation_steps: 1
    offload_optimizer_device: none
    offload_param_device: none
    zero3_init_flag: true
    zero3_save_16bit_model: true
    zero_stage: 3
    distributed_type: DEEPSPEED
    downcast_bf16: 'no'
    enable_cpu_affinity: false
    machine_rank: 0
    main_training_function: main
    mixed_precision: bf16
    num_machines: 1
    num_processes: 4
    rdzv_backend: static
    same_network: true
    tpu_env: []
    tpu_use_cluster: false
    tpu_use_sudo: false
    use_cpu: false
3. check accelerate config:
    accelerate env
4. Run the code:
    accelerate launch sample_finetune.py
"""

logger = logging.getLogger(__name__)


###################
# Hyper-parameters
###################
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 20,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


###############
# Setup logging
###############
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")


################
# Modle Loading
################
model_id = "microsoft/Phi-3-mini-128k-instruct"   # @param ['microsoft/Phi-3-mini-128k-instruct', 'microsoft/Phi-3-mini-4k-instruct']
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",  # loading the model with flash-attenstion support
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'


##################
# Data Processing
##################
def apply_chat_template(
    example,
    tokenizer,
):
    messages = example["messages"]
    # Add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

learning_data = "HuggingFaceH4/ultrachat_200k"  # @param {type: "string"}

raw_dataset = load_dataset(learning_data)
train_dataset = raw_dataset["train_sft"]
test_dataset = raw_dataset["test_sft"]
column_names = list(train_dataset.features)

learning_data_num = 4000  # @param {type: "number"}

processed_train_dataset = train_dataset.select(range(learning_data_num)).map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)

processed_test_dataset = test_dataset.select(range(learning_data_num)).map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to test_sft",
)


###########
# Training
###########
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


#############
# Evaluation
#############
tokenizer.padding_side = 'left'
metrics = trainer.evaluate()
metrics["eval_samples"] = len(processed_test_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


# ############
# # Save model
# ############
trainer.save_model(train_conf.output_dir)

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 

config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

[INFO|configuration_utils.py:726] 2024-05-08 12:49:34,592 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json


configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
[INFO|configuration_utils.py:726] 2024-05-08 12:49:35,187 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:789] 2024-05-08 12:49:35,190 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "interm

modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

[INFO|modeling_utils.py:3429] 2024-05-08 12:49:36,919 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

[INFO|modeling_utils.py:1494] 2024-05-08 12:50:19,065 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:928] 2024-05-08 12:50:19,075 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000,
  "use_cache": false
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-08 12:50:20,863 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-08 12:50:20,865 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

[INFO|configuration_utils.py:883] 2024-05-08 12:50:21,468 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/generation_config.json
[INFO|configuration_utils.py:928] 2024-05-08 12:50:21,470 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2087] 2024-05-08 12:50:25,152 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/tokenizer.model
[INFO|tokenization_utils_base.py:2087] 2024-05-08 12:50:25,153 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/tokenizer.json
[INFO|tokenization_utils_base.py:2087] 2024-05-08 12:50:25,154 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/added_tokens.json
[INFO|tokenization_utils_base.py:2087] 2024-05-08 12:50:25,155 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/special_to

Downloading readme:   0%|          | 0.00/4.44k [00:00<?, ?B/s]

storing https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md in cache at /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
INFO:datasets.utils.file_utils:storing https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k/resolve/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/README.md in cache at /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
creating metadata file for /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783ac5fad54bdd2b1bb0e6d874829c.3ecb0462117b7560fd07d072dc04e5ed70d29811052bd142b028753831bc8e72
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/2308d4ab5d4abace4441eb331fb4a84950783a

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00000-of-00003-a3ecf92756993583.parquet in cache at /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00000-of-00003-a3ecf92756993583.parquet in cache at /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
creating metadata file for /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/2ea1c4b4b741066bce560c9899ac1e726de3810da0926439e2a8a9188efcf481
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00001-of-00003-0a1804bcb6ae68c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
creating metadata file for /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/9ea1b34139edf850223cf94ec86dce854653fede94fc6e028153720541d27f3d
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_sft-00002-of-00003-ee46ed25cfae92c6.parquet in cache at /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
creating metadata file for /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/9dcbe7bdea54ea9a4858bd20ce46c02987c2858888facb1b9a5955586b73dc2b
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.

Downloading data:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.parquet in cache at /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_sft-00000-of-00001-f7dfac4afe5b93f4.parquet in cache at /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
creating metadata file for /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/45acaa106dce5932b3a2c04aed51da0a1c1a155e409733297d737d8f4d8bacfe
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.p

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet in cache at /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet in cache at /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
creating metadata file for /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/430ce1e710e84ea42e9afea2187a6b5fe037624689232f264bb75c83bf40a06a
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca.parquet in cache at /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00001-of-00003-d6a0402e417f35ca.parquet in cache at /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
creating metadata file for /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/608fc5291d763c102a581a38289e5c0ee3ef71fbb67972f303f321c151416837
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd.parquet in cache at /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/train_gen-00002-of-00003-c0db75b92a2f48fd.parquet in cache at /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
creating metadata file for /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/cd410113c226b7528b8052ed6fcf6cf406d84e754c38c02589df6d1d8553c70f
hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.

Downloading data:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.parquet in cache at /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
INFO:datasets.utils.file_utils:storing hf://datasets/HuggingFaceH4/ultrachat_200k@f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/data/test_gen-00000-of-00001-3d4cd8309148a71f.parquet in cache at /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
creating metadata file for /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
INFO:datasets.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/a5835499e7ef071327532513d9bdf5ec3ab8597bc6b37d0823c2dd2ed1375921
Downloading took 0.0 min
INFO:datasets.download.download_manager:Downloading took 0.0 min
Checksum Computation took 0.0 min
INFO:dat

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split
INFO:datasets.builder:Generating test_sft split


Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split
INFO:datasets.builder:Generating train_gen split


Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split
INFO:datasets.builder:Generating test_gen split


Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

All the splits matched successfully.
INFO:datasets.utils.info_utils:All the splits matched successfully.
Dataset ultrachat_200k downloaded and prepared to /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb. Subsequent calls will reuse this data.
INFO:datasets.builder:Dataset ultrachat_200k downloaded and prepared to /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb. Subsequent calls will reuse this data.
Process #0 will write at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00000_of_00010.arrow
INFO:datasets.arrow_dataset:Process #0 will write at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00000_of_00010.arrow
Process #1 will write at /root/.cache/huggingfa

Applying chat template to train_sft (num_proc=10):   0%|          | 0/4000 [00:00<?, ? examples/s]

Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00000_of_00010.arrow
INFO:datasets.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00000_of_00010.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00001_of_00010.arrow
INFO:datasets.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_00001_of_00010.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-cd28af0ed08db791_

Applying chat template to test_sft (num_proc=10):   0%|          | 0/4000 [00:00<?, ? examples/s]

Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-4fcc14d18dc3bf59_00000_of_00010.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-4fcc14d18dc3bf59_00001_of_00010.arrow
INFO:datasets.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-4fcc14d18dc3bf59_00000_of_00010.arrow
INFO:datasets.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-4fcc14d18dc3bf59_00001_of_00010.arrow
Caching processed dataset at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrachat_200k/default/0.0.0/f8e46c0ce6e7cfa42c393cb56add1db4ea9548fb/cache-4fcc14d18dc3bf59_

Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.
INFO:datasets.utils.info_utils:Unable to verify splits sizes.
Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-d6c573df5db672be/0.0.0. Subsequent calls will reuse this data.
INFO:datasets.builder:Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-d6c573df5db672be/0.0.0. Subsequent calls will reuse this data.
Using custom data configuration default-64bd0dd7f292f52d
INFO:datasets.builder:Using custom data configuration default-64bd0dd7f292f52d
Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/generator
INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/generator
Generating dataset generator (/root/.cache/huggingface/datasets/generator/default-64bd0dd7f292f52d/0.0.0)
INFO:datasets.builder:Generating dataset generator (/root/.cache/huggingface/datasets/generator/de

Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.
INFO:datasets.utils.info_utils:Unable to verify splits sizes.
Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-64bd0dd7f292f52d/0.0.0. Subsequent calls will reuse this data.
INFO:datasets.builder:Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-64bd0dd7f292f52d/0.0.0. Subsequent calls will reuse this data.
[INFO|trainer.py:626] 2024-05-08 12:51:33,551 >> Using auto half precision backend
[INFO|trainer.py:2048] 2024-05-08 12:51:33,886 >> ***** Running training *****
[INFO|trainer.py:2049] 2024-05-08 12:51:33,887 >>   Num examples = 2,688
[INFO|trainer.py:2050] 2024-05-08 12:51:33,888 >>   Num Epochs = 1
[INFO|trainer.py:2051] 2024-05-08 12:51:33,889 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2054] 2024-05-08 12:51:33,890 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2055] 2024-05-08 12:51:33,891 >> 

Step,Training Loss
20,1.2612
40,1.2205
60,1.2279
80,1.195
100,1.1278
120,1.1739
140,1.1581
160,1.1542
180,1.1693
200,1.1554


[INFO|trainer.py:3305] 2024-05-08 12:52:09,126 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-20
[INFO|configuration_utils.py:726] 2024-05-08 12:52:09,686 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:789] 2024-05-08 12:52:09,689 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "ph

***** train metrics *****
  epoch                    =         1.0
  total_flos               = 115287070GF
  train_loss               =       1.109
  train_runtime            =  0:19:35.59
  train_samples_per_second =       2.286
  train_steps_per_second   =       0.572


[INFO|trainer.py:3305] 2024-05-08 13:16:43,655 >> Saving model checkpoint to ./checkpoint_dir


***** eval metrics *****
  epoch                   =        1.0
  eval_loss               =     1.0787
  eval_runtime            = 0:05:34.14
  eval_samples            =       4000
  eval_samples_per_second =      8.119
  eval_steps_per_second   =      2.032


[INFO|configuration_utils.py:726] 2024-05-08 13:16:44,312 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:789] 2024-05-08 13:16:44,315 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_po

In [3]:
!cp -a /content/checkpoint_dir/checkpoint-660 Phi_3_mini_custom_128k_instruct

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

torch.random.manual_seed(0)
checkpoint = '/content/checkpoint_dir/checkpoint-660'  # @param {type: "string"}

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


[INFO|configuration_utils.py:726] 2024-05-08 13:17:42,723 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:726] 2024-05-08 13:17:43,256 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:789] 2024-05-08 13:17:43,258 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-08 13:17:46,893 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-08 13:17:46,895 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:883] 2024-05-08 13:17:47,198 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/generation_config.json
[INFO|configuration_utils.py:928] 2024-05-08 13:17:47,200 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2085] 2024-05-08 13:17:47,751 

In [5]:
prompt = 'Which famous landmarks should I visit in London, beyond the usual ones?'  # @param {type: "string"}

messages = [
    {"role": "user", "content": prompt},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])




 London is a city with a rich history and culture, and there are many lesser-known landmarks that are worth visiting. Here are some suggestions:

1. The Tower of London: Apart from the Crown Jewels, the Tower of London has a fascinating history and is home to the Crown Jewels, the Crown Jewels of Scotland, and the Royal Mint.

2. The British Museum: The British Museum is one of the world's most famous museums, and it houses a vast collection of art, artifacts, and historical objects from around the world.

3. The Natural History Museum: The Natural History Museum is one of the most popular museums in London, and it has a vast collection of dinosaur fossils, minerals, and other natural history specimens.

4. The Tate Modern: The Tate Modern is a contemporary art museum that houses a vast collection of modern and contemporary art.

5. The Science Museum: The Science Museum is a science museum that has a vast collection of scientific artifacts, including the original Wright Brothers' airp

In [6]:
import gc
from peft import PeftModel

# メモリ解放
del model
gc.collect()
gc.collect()
torch.cuda.empty_cache()

# モデル名
base_model = "microsoft/Phi-3-mini-128k-instruct"
new_model = "Phi_3_mini_custom_128k_instruct"

# トークナイザの準備
tokenizer = AutoTokenizer.from_pretrained(base_model)

# モデルの準備
fp16_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# アダプタをベースモデルにマージ
model = PeftModel.from_pretrained(fp16_model, new_model)
model = model.merge_and_unload()

[INFO|tokenization_utils_base.py:2087] 2024-05-08 13:18:51,939 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/tokenizer.model
[INFO|tokenization_utils_base.py:2087] 2024-05-08 13:18:51,940 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/tokenizer.json
[INFO|tokenization_utils_base.py:2087] 2024-05-08 13:18:51,941 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/added_tokens.json
[INFO|tokenization_utils_base.py:2087] 2024-05-08 13:18:51,942 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/special_to

The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


[INFO|configuration_utils.py:726] 2024-05-08 13:18:55,997 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/config.json
[INFO|configuration_utils.py:789] 2024-05-08 13:18:55,999 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "origi

The repository for microsoft/Phi-3-mini-128k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-128k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


[INFO|modeling_utils.py:3429] 2024-05-08 13:20:12,928 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/model.safetensors.index.json
[INFO|modeling_utils.py:1494] 2024-05-08 13:20:12,931 >> Instantiating Phi3ForCausalLM model under default dtype torch.float16.
[INFO|configuration_utils.py:928] 2024-05-08 13:20:12,934 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4170] 2024-05-08 13:20:17,459 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4178] 2024-05-08 13:20:17,460 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:883] 2024-05-08 13:20:17,722 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-128k-instruct/snapshots/8a362e755d2faf8cec2bf98850ce2216023d178a/generation_config.json
[INFO|configuration_utils.py:928] 2024-05-08 13:20:17,723 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



In [7]:
# HuggingFace Hubへのアップロード
new_model = "Phi_3_mini_custom_128k_instruct"
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

[INFO|configuration_utils.py:471] 2024-05-08 13:20:23,194 >> Configuration saved in Phi_3_mini_custom_128k_instruct/config.json
[INFO|configuration_utils.py:697] 2024-05-08 13:20:23,196 >> Configuration saved in Phi_3_mini_custom_128k_instruct/generation_config.json
[INFO|modeling_utils.py:2598] 2024-05-08 13:20:48,853 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at Phi_3_mini_custom_128k_instruct/model.safetensors.index.json.
[INFO|hub.py:757] 2024-05-08 13:21:12,509 >> Uploading the following files to Songqiao/Phi_3_mini_custom_128k_instruct: model.safetensors.index.json,README.md,generation_config.json,model-00002-of-00002.safetensors,model-00001-of-00002.safetensors,config.json


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2488] 2024-05-08 13:24:33,655 >> tokenizer config file saved in Phi_3_mini_custom_128k_instruct/tokenizer_config.json
[INFO|tokenization_utils_base.py:2497] 2024-05-08 13:24:33,657 >> Special tokens file saved in Phi_3_mini_custom_128k_instruct/special_tokens_map.json
[INFO|hub.py:757] 2024-05-08 13:24:33,696 >> Uploading the following files to Songqiao/Phi_3_mini_custom_128k_instruct: tokenizer.json,added_tokens.json,README.md,special_tokens_map.json,tokenizer_config.json,tokenizer.model


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Songqiao/Phi_3_mini_custom_128k_instruct/commit/3d48d3b1d0c3eda7b1c6ab5891998a9906e43ce4', commit_message='Upload tokenizer', commit_description='', oid='3d48d3b1d0c3eda7b1c6ab5891998a9906e43ce4', pr_url=None, pr_revision=None, pr_num=None)