In [None]:
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install bitsandbytes

In [None]:
import os
from pathlib import Path


# Target directories for the Hugging Face caches.
HF_HOME = '/content/transformers_cache/huggingface'
HF_DATASETS_CACHE = '/content/huggingface/datasets'
TRANSFORMERS_CACHE = '/content/huggingface/models'


def re_direct_hf_cache():
    """Create the cache directories above and export them as environment variables.

    Each directory is created (including missing parents; an existing directory
    is not an error) and then published under the environment variable that
    carries the same name as the module-level constant.
    """
    cache_locations = {
        'HF_HOME': HF_HOME,
        'HF_DATASETS_CACHE': HF_DATASETS_CACHE,
        'TRANSFORMERS_CACHE': TRANSFORMERS_CACHE,
    }

    # Create all directories first ...
    for cache_dir in cache_locations.values():
        Path(cache_dir).mkdir(parents=True, exist_ok=True)

    # ... then expose them to the current process environment.
    os.environ.update(cache_locations)


re_direct_hf_cache()

In [None]:
# import torch
# from transformers import BitsAndBytesConfig

# quantize_args = {
#     "device": "cuda:0",
#     "double_quant": True,
#     "quant_type": "nf4",
#     "bits": 4,
#     "bf16": False,
#     "fp16": True,
#     "cache_dir": None
# }

# compute_dtype = (torch.float16 if quantize_args["fp16"] else (torch.bfloat16 if quantize_args["bf16"] else torch.float32))

# bnb_model_from_pretrained_args = {}
# bnb_model_from_pretrained_args["device_map"]={"": quantize_args["device"]}
# quantization_config = BitsAndBytesConfig(
#     llm_int8_skip_modules=["mm_projector"],
#     llm_int8_threshold=6.0,
#     llm_int8_has_fp16_weight=False,
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=quantize_args["double_quant"],
#     bnb_4bit_quant_type=quantize_args["quant_type"]  # {'fp4', 'nf4'}
# )

# if quantize_args["bits"] == 4:
#     #bnb_model_from_pretrained_args["load_in_4bit"] = True
#     quantization_config.load_in_4bit = True
# else:
#     #bnb_params["load_in_8bit"] = True
#     quantization_config.load_in_8bit = True

# bnb_model_from_pretrained_args["quantization_config"] = quantization_config

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook itself: it would be saved with
# the file and leak when the notebook is shared. Read it from the HF_TOKEN
# environment variable when present, otherwise prompt for it without echoing.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

# `login` stores the token for later Hub calls made from this runtime.
login(token=hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub id of the checkpoint that receives the pad token.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Tokenizer and causal-LM weights are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loaded with the library defaults. The optional arguments from the quantization
# cell remain disabled:
#   cache_dir=quantize_args["cache_dir"],
#   torch_dtype=(torch.bfloat16 if quantize_args["bf16"] else None),
#   **bnb_model_from_pretrained_args
# and so does `model.config.use_cache = False`.
model = AutoModelForCausalLM.from_pretrained(model_id)

In [None]:
from typing import Dict
import transformers

def _fill_new_rows_with_mean(weight, num_new_tokens: int) -> None:
    """Overwrite the last ``num_new_tokens`` rows of ``weight`` in place with the mean of the rows before them."""
    weight[-num_new_tokens:] = weight[:-num_new_tokens].mean(dim=0, keepdim=True)


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: "transformers.PreTrainedTokenizer",
    model: "transformers.PreTrainedModel",
):
    """Resize tokenizer and embedding.

    Adds the special tokens to ``tokenizer``, resizes the model's token
    embeddings to ``len(tokenizer)`` and, when tokens were actually added,
    initialises the new embedding rows with the mean of the existing rows.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer that receives the special tokens (modified in place).
        model: Model whose embeddings are resized and initialised (modified in place).

    Returns:
        None.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Nothing was appended, so there are no fresh rows to initialise.
    if num_new_tokens <= 0:
        return

    _fill_new_rows_with_mean(model.get_input_embeddings().weight.data, num_new_tokens)

    # A model may expose no output embedding layer; in that case only the input
    # embeddings need initialising instead of failing on `None.weight`.
    output_layer = model.get_output_embeddings()
    if output_layer is not None:
        _fill_new_rows_with_mean(output_layer.weight.data, num_new_tokens)

In [None]:
# Single definition of the pad token literal, shared by the log message and the
# tokenizer update so the two cannot drift apart.
PAD_TOKEN = "<pad>"

# Only add a pad token when the tokenizer does not already define one.
if tokenizer.pad_token is None:
    print(f"Adding pad token as '{PAD_TOKEN}'")
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )
    # Record the id of the pad token on the model config as well.
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Sanity check: show the pad token string, then the pad token id as seen by the
# model config and by the tokenizer (printed side by side on one line).
print(tokenizer.pad_token)
print(*(holder.pad_token_id for holder in (model.config, tokenizer)))

In [None]:
# Local directory that receives both the model and the tokenizer files.
new_model = "/content/Llama-3-8B-Instruct"

# Write the model first, then the tokenizer, into the same directory.
model.save_pretrained(save_directory=new_model)
tokenizer.save_pretrained(save_directory=new_model)

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_tokenizer = AutoTokenizer.from_pretrained(model_id)

# `new_model` was written by `model.save_pretrained(...)`, so it holds a complete
# checkpoint rather than a PEFT adapter. `PeftModel.from_pretrained` needs an
# adapter config in that directory and cannot load it, and there is nothing to
# merge anyway. Reload the saved checkpoint directly; this also confirms that it
# round-trips from disk before it is published. The original base model is
# therefore no longer loaded here.
# Optional loading arguments that were left disabled:
#   low_cpu_mem_usage=True, return_dict=True, torch_dtype=torch.float16, device_map="auto"
merged_model = AutoModelForCausalLM.from_pretrained(new_model)

# Save the merged model
# merged_model.save_pretrained("merged_model", safe_serialization=True)

In [None]:
# Hub repository that receives both the model weights and the tokenizer files.
hub_repo_id = "Llama-3-8B-Instruct_pad_token"

# Upload the model first, then the tokenizer, to the same repository.
merged_model.push_to_hub(repo_id=hub_repo_id)
tokenizer.push_to_hub(repo_id=hub_repo_id)