# Step1. DAPT (Domain Adaptive Pre-Training)

Before you begin, you need to create [step 1 dummy data](./Step0_Dummy_Data.ipynb) or prepare real data ([see here](https://github.com/NVIDIA/NeMo-Curator/tree/main/tutorials/dapt-curation)) for the actual DAPT.
If you plan to train the real model, make sure to prepare not only the domain-specific data but also general-purpose data to be used in the continued pretraining.

We use the Hugging Face `meta-llama/Llama-3.1-8B` model for practice.

In this step, you will perform domain-adaptive tokenization and domain-adaptive continued pretraining (DAPT).


## (1) Domain-adaptive tokenization

In [None]:
import glob
import jsonlines


MODEL_ROOT_DIR = "/work/Models"  # change to your path
DATA_ROOT_DIR = "/work/Data"

# Collect every DAPT JSONL shard. glob returns matches in arbitrary order,
# so sort them to make the concatenated corpus reproducible across runs.
all_files = sorted(glob.glob(f"{DATA_ROOT_DIR}/dapt/*.jsonl"))  # DAPT Data Path

# Gather one newline-terminated entry per record and join once at the end;
# repeated `str +=` in a loop can degrade to quadratic time on large corpora.
text_lines = []
for data_file in all_files:
    with jsonlines.open(data_file) as reader:
        text_lines.extend(obj["text"] + "\n" for obj in reader)
all_texts = "".join(text_lines)

# Write the text data into a file. The encoding is pinned to UTF-8 so that
# non-ASCII text does not depend on the platform's default locale.
all_text_file = f"{DATA_ROOT_DIR}/all_dapt_text.txt"
with open(all_text_file, "w", encoding="utf-8") as data_fp:
    data_fp.write(all_texts)

print(f"Save all dapt text data to {all_text_file}")


In [None]:
# SentencePiece model type, forwarded to the script's --spe_type flag.
tokenizer_spe_type = "bpe"
vocab_size = 100 # target vocab size for domain specific data

# Train a SentencePiece tokenizer on the concatenated DAPT text file.
# The following cell looks for the result under
# DATA_ROOT_DIR/tokenizer_spe_<tokenizer_spe_type>_v<vocab_size>.
!python /opt/NeMo/scripts/tokenizers/process_asr_text_tokenizer.py --data_file $all_text_file --data_root=$DATA_ROOT_DIR --vocab_size=$vocab_size --tokenizer=spe --spe_type=$tokenizer_spe_type  

In [None]:
custom_tokenizer_dir = DATA_ROOT_DIR + f"/tokenizer_spe_{tokenizer_spe_type}_v{vocab_size}"

! ls $custom_tokenizer_dir

## (2) Add domain-specific tokens to the original tokenizer

In [None]:

import os
import wget
from nemo.collections import nlp as nemo_nlp
from nemo.collections import common as nemo_common
from omegaconf import OmegaConf
import huggingface_hub as hf
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import torch

# Hugging Face model ID of the base LLM being adapted.
HF_LLM_MODEL = "meta-llama/Llama-3.1-8B"

# Domain tokenizer trained in section (1), loaded through NeMo's
# SentencePiece wrapper.
domain_tokenizer_model = f"{custom_tokenizer_dir}/tokenizer.model"
domain_tokenizer = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="sentencepiece",
    tokenizer_model=domain_tokenizer_model,
)

# General-purpose tokenizer and weights of the base model.
tokenizer = AutoTokenizer.from_pretrained(HF_LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(HF_LLM_MODEL)


In [None]:
# Keep only the domain tokens that the general-purpose tokenizer lacks.

general_vocab = set(tokenizer.vocab.keys())
domain_vocab = set(domain_tokenizer.vocab)
# Sort the difference instead of calling list() on the set: iteration order
# of a set of strings can change between interpreter runs, and the position
# in this list decides which ID each new token receives when it is added to
# the tokenizer. Sorting keeps the token -> ID mapping reproducible.
domain_only_vocab = sorted(domain_vocab - general_vocab)
print("Domain Only Vocab: ", domain_only_vocab)

In [None]:
# Register the domain-only tokens with the tokenizer, then resize the
# model's token embeddings to the enlarged vocabulary.
original_vocab_size = len(tokenizer)
print("Ori Vocab: ", original_vocab_size)

tokenizer.add_tokens(domain_only_vocab)
extended_vocab_size = len(tokenizer)
model.resize_token_embeddings(extended_vocab_size)

print("New Vocab: ", extended_vocab_size)

## (3) Reinitialize embedding matrix on LLM

In [7]:
def get_embedding_mean(tokens, tokenizer):
    """Return, for each token, the mean input embedding of its sub-tokens.

    Each entry of ``tokens`` is split with ``tokenizer``; the rows of the
    global ``model``'s input-embedding matrix that belong to the resulting
    pieces are averaged into a single vector.

    Args:
        tokens: Iterable of token strings to compute initial values for.
        tokenizer: Tokenizer used to split each token into existing pieces.
            For Hugging Face tokenizers this should be one that does NOT
            already contain ``tokens`` as added tokens: an added token is
            never split, so it would simply map to its own row.

    Returns:
        A list of 1-D tensors, one per entry of ``tokens``, in the same order.

    Raises:
        ValueError: If a token splits into zero pieces. Averaging an empty
            selection would silently produce a NaN vector.
    """
    # Look the weight matrix up once instead of on every iteration.
    embedding_weight = model.get_input_embeddings().weight
    embedding_values = []
    with torch.no_grad():
        for token in tokens:
            pieces = tokenizer.tokenize(token, add_special_tokens=False)
            if not pieces:
                raise ValueError(
                    f"Token {token!r} produced no sub-tokens; cannot average embeddings."
                )
            piece_ids = tokenizer.convert_tokens_to_ids(pieces)
            embedding_values.append(embedding_weight[piece_ids].mean(dim=0))

    return embedding_values

# Average over the *base* tokenizer's sub-tokens. `tokenizer` already has the
# domain tokens registered as added tokens, and Hugging Face tokenizers never
# split an added token, so it would map every domain token to its own freshly
# resized row rather than to the pre-trained pieces it is composed of.
base_tokenizer = AutoTokenizer.from_pretrained(HF_LLM_MODEL)
embedding_values = get_embedding_mean(domain_only_vocab, base_tokenizer)

In [8]:

def set_embedding_value(tokens, new_tokenizer, mean_emb_values):
    """Initialise the embedding rows of newly added tokens on the global ``model``.

    For each token, its row in the input-embedding matrix is set to the
    matching vector from ``mean_emb_values`` and its row in the
    output-embedding matrix is set to zeros.

    Args:
        tokens: Token strings whose rows should be initialised.
        new_tokenizer: Tokenizer that maps each token to its ID.
        mean_emb_values: Sequence of 1-D tensors aligned with ``tokens``.

    Raises:
        ValueError: If the tokenizer returns ``None`` for a token. Indexing
            the weight with ``None`` would overwrite every row of the matrix.
    """
    input_weight = model.get_input_embeddings().weight
    output_weight = model.get_output_embeddings().weight
    # With tied embeddings both layers share one matrix; zeroing the "output"
    # row would wipe the mean that was just written to the same row.
    weights_are_tied = output_weight is input_weight
    with torch.no_grad():
        for i, token in enumerate(tokens):
            token_id = new_tokenizer.convert_tokens_to_ids(token)
            if token_id is None:
                raise ValueError(f"Token {token!r} is not in the tokenizer vocabulary.")
            input_weight[token_id] = mean_emb_values[i]
            if not weights_are_tied:
                output_weight[token_id] = torch.zeros_like(mean_emb_values[i])
            

# Write the computed mean vectors into the input-embedding rows of the
# domain-only tokens and zero the matching output-embedding rows.
set_embedding_value(domain_only_vocab, tokenizer, embedding_values)            

In [None]:
# Sanity check: for every domain-only token, print its input-embedding row
# next to an element-wise comparison with the expected mean vector, followed
# by its output-embedding row.
embedding_layer = model.get_input_embeddings()
output_embedding_layer = model.get_output_embeddings()

for token, expected in zip(domain_only_vocab, embedding_values):
    row = tokenizer.convert_tokens_to_ids(token)
    ori_value = expected.data.numpy()
    init_value = embedding_layer.weight[row].data.numpy()
    out_value = output_embedding_layer.weight[row].data.numpy()
    print(f"Embedding for {token}: {init_value}", "Is Same: ", ori_value == init_value)
    print(f"Output Embedding for {token}: {out_value}")

In [10]:
# Persist the extended tokenizer and the re-initialised model side by side
# so that later steps can load both from one directory.
new_hf_model_path = f"{MODEL_ROOT_DIR}/llama3-new-token"

for artifact in (tokenizer, model):
    artifact.save_pretrained(new_hf_model_path)

## (4) Convert HF model to .nemo

In [None]:

# Destination of the converted checkpoint, placed inside the directory that
# holds the saved Hugging Face model.
nemo_ckpt_path = os.path.join(new_hf_model_path, "model.nemo")
# Value forwarded to the converter's --precision flag.
precision = "bf16"

# Convert HF Model to NeMo
!python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py --input_name_or_path $new_hf_model_path --output_path $nemo_ckpt_path --precision $precision --llama31 True 

## (5) Convert Jsonl data to MMAP

In [None]:
# To train a real model, you also need to transform general purpose data.

domain_data_folder = f"{DATA_ROOT_DIR}/dapt"
if not os.path.exists(f"{DATA_ROOT_DIR}/mmap"):
    os.mkdir(f"{DATA_ROOT_DIR}/mmap")
output_folder = f"{DATA_ROOT_DIR}/mmap/da_mmap"

!python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
--input=$domain_data_folder \
--json-keys=text \
--tokenizer-library=huggingface \
--dataset-impl mmap \
--tokenizer-type $new_hf_model_path \
--output-prefix=$output_folder \
--append-eod \
--workers=4 --preproc-folder

## (6) Domain adaptive continued pretraining

In [None]:
"""
If you want to train the model, please blend the domain-specific data with general-purpose data and use them together for training.
Additionally, make sure to adjust the hyperparameters as needed.
"""

# Dataset prefix passed to model.data.data_prefix below.
# NOTE(review): the "_text_document" suffix is presumably what the
# preprocessing step appends for the `text` JSON key — confirm that the
# files exist under DATA_ROOT_DIR/mmap before launching.
data_prefix = output_folder + "_text_document"
# Directory handed to exp_manager.explicit_log_dir.
output_dir = "/work/log/megatron_llama_dapt"
# Small demo values; the trailing comments presumably record the settings
# intended for a real training run — verify before using them.
max_steps=10 # 23200
global_batch_size=64 # 256

# Tensor- and pipeline-parallel sizes. TP * PP = 8, which equals the
# trainer.devices=8 value used in the command below.
TP=4
PP=2

# Launch continued pretraining from the converted .nemo checkpoint with
# Hydra-style overrides of megatron_llama_config:
#   - the tokenizer is loaded via the huggingface library from the saved
#     model directory (model.tokenizer.type=$new_hf_model_path);
#   - `++model.data.data_prefix=[1.0,$data_prefix]` supplies the single
#     dataset with weight 1.0 (Hydra `++` adds the key or overrides it);
#   - exp_manager.create_wandb_logger=true enables Weights & Biases logging
#     (project DAPT, run name step1);
#   - the checkpoint callback monitors val_loss, keeps the top 1 (mode=min),
#     and saves a .nemo file when training ends;
#   - `~model.optim.sched` is Hydra's delete syntax and removes the
#     scheduler section from the config.
# No comments may be placed between the continuation lines below: they are
# passed to the shell as a single command.
!python /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py  \
    --config-path=/opt/NeMo/examples/nlp/language_modeling/conf \
    --config-name=megatron_llama_config \
    restore_from_path=$nemo_ckpt_path \
    trainer.devices=8 \
    trainer.num_nodes=1 \
    trainer.max_steps=$max_steps \
    trainer.val_check_interval=10 \
    trainer.log_every_n_steps=5 \
    trainer.limit_val_batches=8 \
    trainer.limit_test_batches=8 \
    trainer.accumulate_grad_batches=1 \
    trainer.precision=bf16 \
    model.micro_batch_size=1 \
    model.global_batch_size=$global_batch_size \
    model.tensor_model_parallel_size=$TP \
    model.pipeline_model_parallel_size=$PP \
    model.tokenizer.library=huggingface \
    model.tokenizer.type=$new_hf_model_path \
    model.tokenizer.model=null \
    model.megatron_amp_O2=true \
    model.encoder_seq_length=4096 \
    model.sequence_parallel=true \
    ++model.data.data_prefix=[1.0,$data_prefix] \
    model.data.num_workers=8 \
    model.optim.name=fused_adam \
    model.optim.lr=5e-6 \
    model.optim.betas=[0.9,0.95] \
    exp_manager.explicit_log_dir=$output_dir \
    exp_manager.resume_if_exists=true \
    exp_manager.resume_ignore_no_checkpoint=true \
    exp_manager.create_checkpoint_callback=true \
    exp_manager.create_wandb_logger=true \
    exp_manager.wandb_logger_kwargs.project=DAPT \
    exp_manager.wandb_logger_kwargs.name=step1 \
    exp_manager.checkpoint_callback_params.monitor=val_loss \
    exp_manager.checkpoint_callback_params.save_top_k=1 \
    exp_manager.checkpoint_callback_params.mode=min \
    exp_manager.checkpoint_callback_params.always_save_nemo=false \
    exp_manager.checkpoint_callback_params.save_nemo_on_train_end=true \
    ~model.optim.sched