In [1]:
# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset
from tokenizers import AddedToken
from datasets import DatasetDict, Dataset, load_from_disk
from tokenizers import AddedToken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, HfArgumentParser
from transformers.optimization import Adafactor, AdafactorSchedule



# from ../evaluation_metrics import Metrics
seed = 42
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-09-06 17:56:43.477764: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

In [2]:
# import ipywidgets as widget
# widget.IntSlider()

In [3]:
# from huggingface_hub import notebook_login
# notebook_login()

In [4]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="google/flan-t5", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

# script_args.per_device_train_batch_size,
script_args.gradient_accumulation_steps,
# script_args.per_device_eval_batch_size,

script_args.seq_length

# script_args.dataset_name = "./data/LLLM_TDMS_ALL_TEMPLATE/fold1"
# script_args.output_dir = "./model_ckpt/tdms_all_template_v2"
# script_args.run_name = "sft_llama2_tdms_all_Template_v2"

# script_args.dataset_name = "./data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
# script_args.output_dir = "./model_ckpt/docteat_tdm_f2_all_template"
# script_args.run_name = "sft_llama2_docteat_tdm_f2_all_Template"

script_args.dataset_name = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
script_args.output_dir = "../model_ckpt/docteat_flan_t5_tdm_f2_all_template"
script_args.run_name = "sft_flan_t5_docteat_tdm_f2_all_Template"
script_args.seq_length = 512
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2
script_args.per_device_eval_batch_size = 2
script_args.max_source_length = 512
script_args.max_target_length = 512
script_args.label_pad_token_id = -100
script_args.pad_to_multiple_of = 8
script_args.model_max_length = 512

# # multi GPU
# script_args.per_device_train_batch_size = 4

# script_args.dataset_name = "./data/LLLM_LONG_TDM_ALL_TEMPLATE/fold1"
# script_args.output_dir = "./model_ckpt/long_tdm_f1_all_template"
# script_args.run_name = "sft_llama2_long_tdm_f1_all_Template"
# script_args.seq_length = 2400
# script_args.per_device_train_batch_size = 2
# script_args.gradient_accumulation_steps = 2

script_args.save_steps = 50
script_args.logging_steps = 50
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 10

script_args.model_name = "google/flan-t5"
script_args.size = "large"

In [5]:
# script_args.per_device_train_batch_size,
script_args.gradient_accumulation_steps,
# script_args.per_device_eval_batch_size,

(2,)

In [6]:
# script_args.dataset_name = "../data/LLLM_TDMS_ALL_TEMPLATE/fold1"
# script_args.output_dir = "../model_ckpt/tdms_all_template_v2"
# script_args.run_name = "sft_llama2_tdms_all_Template_v2"

# script_args.dataset_name = "../data/LLLM_LONG_TDMS_ALL_TEMPLATE/fold2"
# script_args.output_dir = "../model_ckpt/long_tdms_f2_all_template"
# script_args.run_name = "sft_llama2_long_tdms_f2_all_Template"
# script_args.seq_length = 2400
# script_args.per_device_train_batch_size = 2
# script_args.gradient_accumulation_steps = 2

# script_args.save_steps = 50
# script_args.logging_steps = 50
# script_args.streaming = False
# script_args.num_train_epochs = 5
# script_args.save_total_limit = 10

In [7]:
script_args.seq_length

512

In [8]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""
    # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
    text = f"{example['prompt']}\n{example['answer']}"
    return text


def create_datasets(tokenizer, args):
    # dataset = load_dataset(
    #     args.dataset_name,
    #     data_dir=args.subset,
    #     split=args.split,
    #     use_auth_token=True,
    #     num_proc=args.num_workers if not args.streaming else None,
    #     streaming=args.streaming,
    # )
    
    dataset = DatasetDict.load_from_disk(f"{args.dataset_name}")
    dataset = dataset.shuffle(seed=seed)
    
    # if args.streaming:
    #     print("Loading the dataset in streaming mode")
    #     valid_data = dataset.take(args.size_valid_set)
    #     train_data = dataset.skip(args.size_valid_set)
    #     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=None)
    # else:
    
    # dataset = dataset.train_test_split(test_size=0.005, seed=None)
    train_data = dataset["train"]
    valid_data = dataset["validation"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=400)
    # chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=len(train_data)//2)
    # 3.70
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

In [9]:
# base_model = AutoModelForCausalLM.from_pretrained(
#     script_args.model_name,
#     quantization_config=bnb_config,
#     device_map={"": 0},
#     trust_remote_code=True,
#     token="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",
#     # use_auth_token=True,
# )

# base_model.config.use_cache = False

# peft_config = LoraConfig(
#     r=script_args.lora_r,
#     lora_alpha=script_args.lora_alpha,
#     lora_dropout=script_args.lora_dropout,
#     target_modules=["q_proj", "v_proj"],
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# tokenizer = AutoTokenizer.from_pretrained(
#     script_args.model_name, 
#     token="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",
#     trust_remote_code=True
# )

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# tokenizer.add_tokens(AddedToken("{", normalized=False))
# tokenizer.add_tokens(AddedToken("}", normalized=False))


tokenizer = AutoTokenizer.from_pretrained(f"{script_args.model_name}-{script_args.size}")

tokenizer.add_tokens(AddedToken("\n", normalized=False))
tokenizer.add_tokens(AddedToken("{", normalized=False))
tokenizer.add_tokens(AddedToken("}", normalized=False))

base_model = AutoModelForSeq2SeqLM.from_pretrained(f"{script_args.model_name}-{script_args.size}")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=base_model,
    label_pad_token_id=script_args.label_pad_token_id,
    pad_to_multiple_of=script_args.label_pad_token_id
)

print(f"Max token lenght: {tokenizer.model_max_length}")

Max token lenght: 512


In [10]:
# https://github.com/tatsu-lab/stanford_alpaca/issues/133#issuecomment-1483893538

training_args = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    # max_steps=script_args.max_steps,
    report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_steps=script_args.num_warmup_steps,
    optim=script_args.optimizer_type,
    # bf16=True,
    # fp16=True,
    remove_unused_columns=False,
    num_train_epochs = script_args.num_train_epochs,
    run_name=script_args.run_name,
)

In [11]:
print(torch.__version__)

2.0.1+cu117


In [12]:
train_dataset, eval_dataset = create_datasets(tokenizer, script_args)

Size of the train set: 82695. Size of the validation set: 35280


  0%|          | 0/400 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1062 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:00<00:00, 452.76it/s]

The character to token ratio of the dataset is: 3.35





In [13]:
len(train_dataset)

82695

In [14]:
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")

Number of GPUs available: 1


In [15]:
expected_steps = ((len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)) * training_args.num_train_epochs)// num_gpus
# expected_steps = (len(train_dataset) // (training_args.per_device_train_batch_size)) * training_args.num_train_epochs

print(f"Expected steps: {expected_steps}")


Expected steps: 34455


In [16]:
class CustomWandbCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # Custom logs you want to add
        custom_logs = {
            "training_args": training_args,
            # ... any other custom data
        }
        wandb.log(custom_logs)  # Log the custom data to wandb

In [17]:
# wandb.log({"training_args": training_args})

# training_args

In [18]:
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # peft_config=peft_config,
    packing=script_args.packing,
    # max_seq_length=None,
    max_seq_length=script_args.seq_length,
    tokenizer=tokenizer,
    args=training_args,
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mskabongo[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670666033072244, max=1.0…

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,0.8473
100,0.0937



KeyboardInterrupt



In [20]:
# 34450

# MODEL SAVING

In [None]:
trainer.save_model(script_args.output_dir)

output_dir = os.path.join(script_args.output_dir, "final_checkpoint_")
output_dir = os.path.join(script_args.output_dir, f"{script_args.run_name}")

trainer.model.save_pretrained(output_dir)

# # Free memory for merging weights
# del base_model
# torch.cuda.empty_cache()

# model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

In [None]:
# model = AutoPeftModelForCausalLM.from_pretrained(
#     script_args.output_dir, 
#     device_map="auto", 
#     torch_dtype=torch.bfloat16,
#     offload_folder = "offload/"
# )

# # model = model.merge_and_unload()

In [8]:
model = AutoPeftModelForCausalLM.from_pretrained(
    script_args.output_dir,
    device_map="auto",
    # low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # load_in_4bit=True,
)

model = model.merge_and_unload()

output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Inference

In [27]:
script_args.output_dir

'../model_ckpt/docteat_flan_t5_tdm_f2_all_template'

In [29]:
model = AutoPeftModelForCausalLM.from_pretrained(
    script_args.output_dir,
    # low_cpu_mem_usage=True,
    # torch_dtype=torch.bfloat16,
    # load_in_4bit=True,
)

# model.generate()

ValueError: Can't find 'adapter_config.json' at '../model_ckpt/docteat_flan_t5_tdm_f2_all_template'

In [11]:
# tokenizer = AutoTokenizer.from_pretrained(script_args.output_dir)

train_dataset, eval_dataset = create_datasets(tokenizer, script_args)

# for example in tqdm(iter(dataset)):


Size of the train set: 82680. Size of the validation set: 35295


100%|██████████| 400/400 [00:00<00:00, 515.10it/s]

The character to token ratio of the dataset is: 3.73





In [22]:
dataset = DatasetDict.load_from_disk(f"{script_args.dataset_name}")
    
    # if args.streaming:
    #     print("Loading the dataset in streaming mode")
    #     valid_data = dataset.take(args.size_valid_set)
    #     train_data = dataset.skip(args.size_valid_set)
    #     train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=None)
    # else:
    
# dataset = dataset.train_test_split(test_size=0.005, seed=None)
train_data = dataset["train"]
valid_data = dataset["validation"]

train_data[0]

{'prompt': 'Value Prediction Network This paper proposes a novel deep reinforcement learning (RL) architecture, called Value Prediction Network (VPN), which integrates model-free and model-based RL methods into a single neural network. In contrast to typical model-based RL methods, VPN learns a dynamics model whose abstract states are trained to make option-conditional predictions of future values (discounted sum of rewards) rather than of future observations. Our experimental results show that VPN has several advantages over both model-free and model-based baselines in a stochastic environment where careful planning is required but building an accurate observation-prediction model is difficult. Furthermore, VPN outperforms Deep Q-Network (DQN) on several Atari games even with short-lookahead planning, demonstrating its potential as anew way of learning a good state representation. VPN has four more hyperparameters: 1) the number of predictions steps (k) during training, 2) the plan de

In [23]:
len(valid_data)

35280

In [26]:
# model = AutoModelForSeq2SeqLM.from_pretrained(f"/root/LLLM-LeaderboardLLM/model_ckpt/docteat_flan_t5_tdm_f2_all_template/checkpoint-100")

In [24]:
DEV = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# tokenizer = AutoTokenizer.from_pretrained(script_args.output_dir)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer.encode(f"{train_data[0]['prompt']}", return_tensors="pt").to(DEV)

generate_kwargs = dict(
    input_ids=inputs,
    temperature=0.2, 
    top_p=0.95, 
    top_k=40,
    max_new_tokens=500,
    repetition_penalty=1.3
)
outputs = model.generate(**generate_kwargs)
print(tokenizer.decode(outputs[0]))

NameError: name 'model' is not defined

In [20]:
train_data[0]['answer']

"[{'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Seaquest', 'Metric': 'Score', 'Score': '5628'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Amidar', 'Metric': 'Score', 'Score': '641'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Krull', 'Metric': 'Score', 'Score': '15930'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Alien', 'Metric': 'Score', 'Score': '1429'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Enduro', 'Metric': 'Score', 'Score': '382'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Ms. Pacman', 'Metric': 'Score', 'Score': '2689'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Crazy Climber', 'Metric': 'Score', 'Score': '54119'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Q*Bert', 'Metric': 'Score', 'Score': '14517'}}, {'LEADERBOARD': {'Task': 'Atari Games', 'Dataset': 'Atari 2600 Frostbite', 'Metric': 'Score', 'Score': '38