In [1]:
# !pip install -q transformers --user
# !pip install -q accelerate --user
# !pip install -q bitsandbytes --user

[0m

In [2]:
# Fine-Tune Llama2-7b on custom dataset
import os
import random
import shutil
from dataclasses import dataclass, field
from typing import Optional

import ipdb
import numpy as np
import torch
import wandb
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainerCallback,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

from evaluation_metrics import Metrics

# from ../evaluation_metrics import Metrics
seed = 42
# Seed every random number generator used in this notebook. torch.manual_seed
# seeds torch's default (CPU) generator; previously only the CUDA generators
# were seeded, so CPU-side torch randomness differed between runs.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

# Shown as the cell output so the reader can see which device was picked.
device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-12-09 18:43:36.916456: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-09 18:43:36.975415: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

In [3]:
# import ipywidgets as widget
# widget.IntSlider()

In [4]:
# from huggingface_hub import notebook_login
# notebook_login()

In [5]:
@dataclass
class ScriptArguments:
    """Run configuration for the fine-tuning notebook.

    Every field carries a default so the dataclass can be built from an empty
    argument list. The trailing "run bookkeeping" fields are attributes that
    later cells assign or read on the parsed instance; declaring them here
    means reading one that has not been set yet yields its default instead
    of raising ``AttributeError``.
    """

    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    # --- data ---
    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    # --- training loop ---
    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    # --- LoRA ---
    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    # --- optimisation ---
    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})

    # --- run bookkeeping (appended last so existing positional use is unaffected) ---
    size: Optional[str] = field(default="7b", metadata={"help": "model size tag used in run and output names"})
    run_name: Optional[str] = field(default=None, metadata={"help": "the run name reported to the logger"})
    test_dataset: Optional[str] = field(default=None, metadata={"help": "path of the dataset used for inference checks"})
    num_train_epochs: Optional[float] = field(default=3.0, metadata={"help": "the number of training epochs"})
    save_total_limit: Optional[int] = field(default=None, metadata={"help": "maximum number of checkpoints to keep"})
    checkpoint: Optional[str] = field(default=None, metadata={"help": "checkpoint folder name to reload"})
    test_ckpt: Optional[str] = field(default=None, metadata={"help": "checkpoint folder name to test"})

# Build the configuration purely from the dataclass defaults: an explicit empty
# argument list is handed to the parser, so nothing is read from the command line.
parser = HfArgumentParser(ScriptArguments)
(script_args,) = parser.parse_args_into_dataclasses([])

# Packing and length-grouping are mutually exclusive; fail early if both are on.
if script_args.packing and script_args.group_by_length:
    raise ValueError("Cannot use both packing and group by length")

In [6]:
# script_args.per_device_train_batch_size,
script_args.gradient_accumulation_steps,
# script_args.per_device_eval_batch_size,

(2,)

In [7]:
# --- Model -------------------------------------------------------------------
# Alternatives used in earlier runs: "meta-llama/Llama-2-13b-hf" (size "13b"),
# "mistralai/Mistral-7B-v0.1", "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1".
script_args.model_name = "meta-llama/Llama-2-7b-hf"
script_args.size = "7b"
script_args.seq_length = 2400
# script_args.checkpoint = "checkpoint-25200"

# --- Data and output locations -------------------------------------------------
# Earlier runs pointed at the LLLM_DOCTEAT_TDM(S)_ALL_TEMPLATE folds and at
# ../data/LLLM_LONG_TDMS_ALL_TEMPLATE/fold1 (with seq_length 1024, train batch 4).
i = 1
script_args.dataset_name = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE/fold1"
script_args.test_dataset = f"../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_{i}/fold1"
# Both names embed the size tag and sequence length set above.
script_args.output_dir = f"../model_ckpt/long_summ_llama2_{script_args.size}_tdms_f1_all_template_seq_{script_args.seq_length}"
script_args.run_name = f"sft_long_summ_llama2_{script_args.size}_tdms_f1_all_template_seq_{script_args.seq_length}"

# --- Batch sizes ---------------------------------------------------------------
script_args.per_device_train_batch_size = 3
script_args.per_device_eval_batch_size = 2
script_args.gradient_accumulation_steps = 2

# --- Schedule, logging and checkpointing ---------------------------------------
script_args.num_train_epochs = 2
script_args.max_steps = 2
script_args.logging_steps = 2
script_args.save_steps = 2
script_args.save_total_limit = 1
script_args.streaming = False

In [8]:
script_args.seq_length

2400

In [9]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    Args:
        dataset: iterable of examples accepted by ``prepare_sample_text``.
        tokenizer: tokenizer exposing ``is_fast`` and either a call returning an
            object with ``tokens()`` (fast) or a ``tokenize`` method (slow).
        nb_examples: maximum number of examples read from the start of ``dataset``.

    Returns:
        Total characters divided by total tokens over the examples read.

    Raises:
        ValueError: if no token was counted (for instance, an empty dataset),
            instead of an opaque ``ZeroDivisionError``.
    """
    # Size the progress bar by what will actually be consumed; datasets that do
    # not implement len() fall back to the requested count.
    try:
        planned = min(nb_examples, len(dataset))
    except TypeError:
        planned = nb_examples

    total_characters, total_tokens = 0, 0
    # zip with range() stops after nb_examples items or when the dataset ends.
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=planned):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    if total_tokens == 0:
        raise ValueError("Cannot estimate chars-per-token ratio: no tokens were counted (is the dataset empty?)")

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once, then reports the trainable count,
    the total count and the trainable share as a percentage.
    """
    # (element count, requires_grad) for every parameter tensor
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}"
    )


def prepare_sample_text(example):
    """Render one dataset row as a single "Question: ... Answer: ..." string.

    Reads the ``prompt`` and ``answer`` keys of ``example``.
    """
    question = example['prompt']
    reply = example['answer']
    return f"Question: {question}\n\nAnswer: {reply}"

def create_datasets(tokenizer, args, num_shards=2000, nb_examples=600):
    """Load the on-disk dataset and wrap its splits as constant-length datasets.

    Args:
        tokenizer: tokenizer passed to ``chars_token_ratio`` and ``ConstantLengthDataset``.
        args: configuration object; ``dataset_name`` and ``seq_length`` are read.
        num_shards: each split is reduced to the first of ``num_shards`` shards.
            The default reproduces the previously hard-coded value (2000), which
            keeps only a small slice of the data; pass ``None`` or ``1`` to use
            the full splits.
        nb_examples: number of training examples used to estimate the
            characters-per-token ratio (previously hard-coded to 600).

    Returns:
        ``(train_dataset, valid_dataset)``; the training dataset iterates
        infinitely, the validation dataset does not.
    """
    dataset = DatasetDict.load_from_disk(f"{args.dataset_name}")
    # `seed` is the notebook-level seed, so the shuffle is reproducible.
    dataset = dataset.shuffle(seed=seed)

    train_data = dataset["train"]
    valid_data = dataset["validation"]
    if num_shards and num_shards > 1:
        train_data = train_data.shard(num_shards=num_shards, index=0)
        valid_data = valid_data.shard(num_shards=num_shards, index=0)

    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=nb_examples)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    # Both splits share the formatting function, sequence length and ratio;
    # only the `infinite` flag differs.
    shared = dict(
        formatting_func=prepare_sample_text,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, **shared)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, **shared)
    return train_dataset, valid_dataset


In [None]:
# SECURITY: the access token used to be hard-coded in this cell. It is now read
# from the HF_TOKEN environment variable; when that is unset, `True` asks the
# library to use the token stored by `huggingface-cli login`.
# The token that was previously committed here must be revoked.
hf_token = os.environ.get("HF_TOKEN") or True

# 4-bit NF4 quantisation with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on device 0
    trust_remote_code=True,
    use_auth_token=hf_token,
)

# The generation cache is switched off for training.
base_model.config.use_cache = False

# LoRA adapters on the attention query and value projections only.
peft_config = LoraConfig(
    r=script_args.lora_r,
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained(
    script_args.model_name, 
    use_auth_token=hf_token,
    trust_remote_code=True
)

# No dedicated pad token is configured, so the EOS token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training



Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# https://github.com/tatsu-lab/stanford_alpaca/issues/133#issuecomment-1483893538

# training_args = TrainingArguments(
#     output_dir=script_args.output_dir,
#     per_device_train_batch_size=script_args.per_device_train_batch_size,
#     gradient_accumulation_steps=script_args.gradient_accumulation_steps,
#     per_device_eval_batch_size=script_args.per_device_eval_batch_size,
#     learning_rate=script_args.learning_rate,
#     logging_steps=script_args.logging_steps,
#     # max_steps=script_args.max_steps,
#     # report_to=script_args.log_with,
#     save_steps=script_args.save_steps,
#     group_by_length=script_args.group_by_length,
#     lr_scheduler_type=script_args.lr_scheduler_type,
#     warmup_steps=script_args.num_warmup_steps,
#     optim=script_args.optimizer_type,
#     # bf16=True,
#     # fp16=True,
#     fp16=False,
#     bf16=False,
#     remove_unused_columns=False,
#     num_train_epochs = script_args.num_train_epochs,
#     run_name=script_args.run_name,
#     evaluation_strategy="steps",
#     save_strategy="steps",
# )

In [None]:
print(torch.__version__)

In [None]:
train_dataset, eval_dataset = create_datasets(tokenizer, script_args)

In [None]:
len(train_dataset)

In [None]:
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")

In [None]:
# Rough estimate of optimizer steps: dataset length divided by the per-device
# effective batch (batch size x gradient accumulation), times epochs, split
# across GPUs. max(..., 1) keeps the estimate defined when no GPU is visible
# (device_count() == 0), which previously raised ZeroDivisionError.
expected_steps = ((len(train_dataset) // (script_args.per_device_train_batch_size * script_args.gradient_accumulation_steps)) * script_args.num_train_epochs) // max(num_gpus, 1)

print(f"Expected steps: {expected_steps}")


In [None]:
class CustomWandbCallback(TrainerCallback):
    """Trainer callback that pushes extra data to Weights & Biases on every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log the training arguments of the running trainer to wandb.

        Uses the ``args`` handed to the callback rather than a notebook-level
        ``training_args`` global, so the callback does not depend on which
        cells were executed before it.
        """
        # Nothing to log to if no wandb run has been started.
        if wandb.run is None:
            return

        # Custom logs you want to add
        custom_logs = {
            # A plain dict of simple values instead of the TrainingArguments object.
            "training_args": args.to_sanitized_dict(),
            # ... any other custom data
        }
        wandb.log(custom_logs)  # Log the custom data to wandb

class CustomTrainer(SFTTrainer):
    """SFTTrainer that also prunes checkpoint folders by loss.

    Two rankings of ``(loss, global_step)`` pairs are kept, one for training
    losses and one for evaluation losses. Each is capped at
    ``self.args.save_total_limit`` entries; when a step drops out of a ranking,
    its ``checkpoint-<step>`` folder under ``self.args.output_dir`` is removed
    if it exists.

    NOTE(review): the two rankings prune independently, so a checkpoint kept by
    one may be deleted by the other, and this runs alongside the Trainer's own
    ``save_total_limit`` rotation. Confirm that this is the intended policy.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Both lists hold (loss, global_step) pairs sorted best (lowest) first.
        self.best_train_losses = []
        self.best_val_losses = []  # was never initialised, so evaluate() raised AttributeError

    def _delete_checkpoint(self, step):
        """Remove ``checkpoint-<step>`` from the output directory (main process only)."""
        checkpoint_folder = os.path.join(self.args.output_dir, f"checkpoint-{step}")
        # Checkpoints are directories: os.remove() fails on them, rmtree() is required.
        if self.is_world_process_zero() and os.path.isdir(checkpoint_folder):
            shutil.rmtree(checkpoint_folder, ignore_errors=True)

    def _track_loss(self, history, loss):
        """Add ``loss`` for the current step to ``history`` and prune evicted steps.

        Returns the updated ranking, truncated to ``save_total_limit`` entries
        when a positive limit is configured; otherwise nothing is pruned.
        """
        ranked = sorted(history + [(loss, self.state.global_step)], key=lambda pair: pair[0])
        limit = self.args.save_total_limit
        if limit is None or limit <= 0:
            return ranked

        kept, evicted = ranked[:limit], ranked[limit:]
        kept_steps = {step for _, step in kept}
        for _, step in evicted:
            # A step can appear more than once (several micro-batches per
            # optimizer step); only delete once it has left the ranking entirely.
            if step not in kept_steps:
                self._delete_checkpoint(step)
        return kept

    def training_step(self, model, inputs, *args, **kwargs):
        """
        Perform a training step, then record its loss in the training ranking.

        Extra positional/keyword arguments are forwarded unchanged to the
        parent implementation. Returns the loss from the parent implementation.
        """
        loss = super().training_step(model, inputs, *args, **kwargs)
        self.best_train_losses = self._track_loss(self.best_train_losses, loss.item())
        return loss

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run evaluation, then record ``<prefix>_loss`` in the validation ranking.

        Returns the metrics dictionary from the parent implementation.
        """
        metrics = super().evaluate(
            eval_dataset=eval_dataset,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        loss_key = f"{metric_key_prefix}_loss"
        if loss_key in metrics:
            self.best_val_losses = self._track_loss(self.best_val_losses, metrics[loss_key])

        return metrics


        
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the project metrics.

    Args:
        eval_preds: a ``(predictions, labels)`` pair. ``predictions`` may be a
            tuple, in which case its first element is used.

    Returns:
        The dictionary returned by ``Metrics.evaluate_property_wise_json_based``,
        updated with the entries from ``Metrics.evaluate_rouge``.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # A 3-D array is taken to be raw logits (batch, sequence, vocabulary) and is
    # reduced to token ids; 2-D input is assumed to already hold token ids.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks ignored positions and cannot be decoded; swap in the pad token.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    results = Metrics.evaluate_property_wise_json_based(label_list=decoded_labels, prediction_list=decoded_preds)

    print(results)

    results.update(Metrics.evaluate_rouge(label_list=decoded_labels, prediction_list=decoded_preds))

    return results

In [None]:
script_args.save_total_limit

In [None]:
# https://github.com/tatsu-lab/stanford_alpaca/issues/133#issuecomment-1483893538
training_args = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    # max_steps=script_args.max_steps,
    report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    evaluation_strategy="steps",
    save_strategy="steps",
    # load_best_model_at_end needs save_steps to be a round multiple of
    # eval_steps, so evaluation follows the save cadence instead of a
    # hard-coded 2 that only matched by coincidence.
    eval_steps=script_args.save_steps,
    load_best_model_at_end=True,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_steps=script_args.num_warmup_steps,
    optim=script_args.optimizer_type,
    # Mixed precision is disabled in both flavours.
    fp16=False,
    bf16=False,
    remove_unused_columns=False,
    num_train_epochs=script_args.num_train_epochs,
    run_name=script_args.run_name,
)

# CustomTrainer (defined above) takes the same arguments and can be swapped in here.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    packing=script_args.packing,
    max_seq_length=script_args.seq_length,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    args=training_args,
)

trainer.train()
# trainer.evaluate()

In [19]:
# # 34450
# !rm -r ../model_ckpt/docteat_llama2_13b_tdms_f1_all_template/*
trainer.state.best_model_checkpoint

'../model_ckpt/long_summ_llama2_7b_tdms_f1_all_template_seq_2400/checkpoint-4'

# MODEL SAVING

In [25]:
f"{script_args.output_dir}/best_checkpoint"

'../model_ckpt/long_summ_llama2_7b_tdms_f1_all_template_seq_2400/best_checkpoint'

In [20]:
# Persist the trained model under the run's output directory in two forms:
# the trainer's own save and the wrapped model's save_pretrained export.
checkpoint_root = script_args.output_dir
trainer.save_model(f"{checkpoint_root}/best_checkpoint")
trainer.model.save_pretrained(f"{checkpoint_root}/save_pretrained")

# output_dir = os.path.join(script_args.output_dir, "final_checkpoint_")
# output_dir = os.path.join(script_args.output_dir, f"{script_args.run_name}")

# trainer.model.save_pretrained(output_dir)

# # Free memory for merging weights
# del base_model
# torch.cuda.empty_cache()

# model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

In [None]:
# model = AutoPeftModelForCausalLM.from_pretrained(
#     script_args.output_dir, 
#     device_map="auto", 
#     torch_dtype=torch.bfloat16,
#     offload_folder = "offload/"
# )

# # model = model.merge_and_unload()

In [None]:
# model = AutoPeftModelForCausalLM.from_pretrained(
#     # script_args.output_dir,
    
#     device_map="auto",
#     # low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
#     # load_in_4bit=True,
# )

# model = model.merge_and_unload()

# output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

# Inference

In [14]:
f"{script_args.output_dir}/{script_args.test_ckpt}"

'../model_ckpt/long_llama2_7b_tdms_f1_all_template_seq_len_3000/checkpoint-3000'

In [21]:
# SECURITY: access tokens used to be hard-coded in this cell (including one in
# a comment). The token is now read from HF_TOKEN; when unset, `True` asks the
# library to use the token stored by `huggingface-cli login`.
# The previously committed tokens must be revoked.
hf_token = os.environ.get("HF_TOKEN") or True

# Reload the adapter saved under <output_dir>/best_checkpoint in 4-bit.
model = AutoPeftModelForCausalLM.from_pretrained(
    # f"{script_args.output_dir}/{script_args.test_ckpt}",
    f"{script_args.output_dir}/best_checkpoint",    
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    use_auth_token=hf_token,
)

# model.config.use_cache = False


tokenizer = AutoTokenizer.from_pretrained(
    # f"{script_args.output_dir}/{script_args.test_ckpt}",
    f"{script_args.output_dir}/best_checkpoint",
    use_auth_token=hf_token,
)

# model = model.cpu()
# model = model.to("cpu")

# torch.cuda.empty_cache()
# # model.generate()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
# model.config.use_cache = True
script_args.test_dataset

'../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_1/fold1'

In [22]:
# Load the dataset used for the qualitative inference checks below.
dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}")

# Shuffle both splits with a fixed seed so row indices are stable across runs.
train_data, valid_data = (
    dataset[split_name].shuffle(seed=42) for split_name in ("train", "validation")
)

len(valid_data)

1298

In [23]:
# randrange excludes the upper bound. random.randint(0, len(valid_data)) could
# return len(valid_data) itself, which is one past the last valid index.
idx = random.randrange(len(valid_data))

print(f"Index: {idx}\n")

# print(f"Question: {valid_data[idx]['prompt']}")
# print("\n#################################################\n")
# print(f"Answer: {valid_data[idx]['answer']}")

Index: 1000



In [24]:
# Build the prompt in the same "Question: ...\n\nAnswer:" layout the training
# text uses, so the model is cued to produce the answer rather than continue
# the question. No trailing space after "Answer:" is added on purpose.
prompt = f"Question: {valid_data[idx]['prompt']}\n\nAnswer:"

# max_length counts prompt and generated tokens together, hence the extra
# headroom on top of the training sequence length.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=script_args.seq_length+100)
# pipe = pipeline(task="text-generation", model=f"{script_args.output_dir}/save_pretrained", tokenizer=tokenizer, max_length=script_args.seq_length)

# result = pipe(f"<s>[INST] {prompt} [/INST]")
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'Peg

Question: Title:	Modified Distribution Alignment for Domain Adaptation with Pre-trained Inception ResNet

Abstract:	Deep neural networks have been widely used in computer vision. There are several well trained deep neural networks for the ImageNet classification challenge, which has played a significant role in image recognition. However, little work has explored pre-trained neural networks for image recognition in domain adaption. In this paper, we are the first to extract better-represented features from a pre-trained Inception ResNet model for domain adaptation. We then present a modified distribution alignment method for classification using the extracted features. We test our model using three benchmark datasets (Office+Caltech-10, Office-31 and Office-Home). Extensive experiments demonstrate significant improvements (4.8%, 5.5%, and 10%) in classification accuracy over the state-of-the-art.

Domain Adaptation; Pre-trained Inception ResNet; Distribution Alignment;

Results

Descri

In [25]:
print(f"Answer: {valid_data[idx]['answer']}")

Answer: [{'LEADERBOARD': {'Task': 'Domain Adaptation', 'Dataset': 'Office-31', 'Metric': 'Average Accuracy', 'Score': '89.8'}}]


In [41]:
# prompt = f"Question: {valid_data[idx]['prompt']}"

# result = pipe(f"{prompt}")
# print(result[0]['generated_text'])

In [28]:
print(f"Answer: {valid_data[idx]['answer']}")

Answer: [{'LEADERBOARD': {'Task': 'Depth Estimation', 'Dataset': 'NYU-Depth V2', 'Metric': 'RMS', 'Score': '0.407'}}, {'LEADERBOARD': {'Task': 'Monocular Depth Estimation', 'Dataset': 'KITTI Eigen split', 'Metric': 'absolute relative error', 'Score': '0.064'}}, {'LEADERBOARD': {'Task': 'Monocular Depth Estimation', 'Dataset': 'NYU-Depth V2', 'Metric': 'RMSE', 'Score': '0.392'}}]


In [None]:
# Path of the checkpoint reloaded in the next cell (fixes the `o/utput_dir` typo).
f"{script_args.output_dir}/{script_args.checkpoint}"

In [None]:
# SECURITY: the access token used to be hard-coded here. It is now read from
# HF_TOKEN; when unset, `True` asks the library to use the token stored by
# `huggingface-cli login`. The previously committed token must be revoked.
hf_token = os.environ.get("HF_TOKEN") or True

model = AutoPeftModelForCausalLM.from_pretrained(
    # "../model_ckpt/docteat_llama2_13b_tdms_f2_all_template/sft_docteat_llama2_13b_tdms_f2_all_Template",
    f"{script_args.output_dir}/{script_args.checkpoint}",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    # load_in_4bit=True,
    device_map={"": 0},  # place the whole model on device 0
    trust_remote_code=True,
    token=hf_token,
)

# Smoke test: generate with no explicit inputs.
model.generate()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(f"{script_args.output_dir}/{script_args.checkpoint}")

train_dataset, eval_dataset = create_datasets(tokenizer, script_args)

# for example in tqdm(iter(dataset)):


In [None]:
# Reload the training dataset from disk and expose its two splits as-is
# (no shuffling in this cell).
dataset = DatasetDict.load_from_disk(f"{script_args.dataset_name}")
train_data, valid_data = dataset["train"], dataset["validation"]

# Show the first validation row.
valid_data[0]

In [None]:
len(valid_data)

In [None]:
outputs.shape

In [None]:
DEV = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# tokenizer = AutoTokenizer.from_pretrained(script_args.output_dir)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the same "Question: ...\n\nAnswer:" layout as the training text; the raw
# prompt alone did not match what the model was fine-tuned on, and the
# post-processing cell splits the output on "Answer:".
prompt_text = f"Question: {valid_data[0]['prompt']}\n\nAnswer:"
inputs = tokenizer.encode(prompt_text, return_tensors="pt").to(DEV)


# Optional sampling controls are left commented out; see the notes at the end
# of the notebook.
generate_kwargs = dict(
    input_ids=inputs,
    # temperature=0.2, 
    # top_p=0.95, 
    # top_k=40,
    # max_new_tokens=1200,
    # repetition_penalty=1.3
)
outputs = model.generate(**generate_kwargs)
print(tokenizer.decode(outputs[0]))

In [None]:
tokenizer.decode(outputs[0]).split("Answer:")[-1].replace("</s>", "").strip()

In [None]:
valid_data[0:3]['answer']

## Generation hyperparameters

`temperature`, `top_p`, `top_k`, `max_new_tokens`, and `repetition_penalty` (shown commented out in the `generate_kwargs` cell above) are hyperparameters that control the generation behavior of a large language model, such as the fine-tuned Llama 2 model used in this notebook.

Here's a brief description of each:

1. **temperature**: Adjusts the "sharpness" of the distribution of the next-token probabilities. Lower values (e.g., 0.2) make the outputs more deterministic (i.e., more focused on high-probability outcomes), while higher values make it more random.

2. **top_p (nucleus sampling)**: Instead of sampling from the full distribution, it samples from the smallest set of words whose cumulative probability exceeds the value of `top_p`. This can lead to more dynamic and unpredictable outputs.

3. **top_k**: Limits the set of tokens considered for generation to the top k probabilities. This can avoid very rare words that can sometimes disrupt the coherence of generated text.

4. **max_new_tokens**: The maximum number of tokens to generate in the response. It essentially controls the length of the generated output.

5. **repetition_penalty**: If set to a value > 1.0, it penalizes already generated tokens, making repetitions less likely. Conversely, values < 1.0 make repetitions more likely.

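For "better" generation, it depends on your specific use case. Note that with Hugging Face `generate`, `temperature`, `top_p`, and `top_k` only take effect when sampling is enabled (`do_sample=True`); with greedy decoding they are ignored.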

- **For more deterministic, focused outputs**: Lower `temperature` (e.g., 0.2), reasonable `top_k` (like 40), and `top_p` closer to 1 (like 0.95) work well.

- **For more creative, diverse outputs**: Increase `temperature` (e.g., 0.7 or higher), use a more liberal `top_k` value or rely more on `top_p` (like setting it to 0.85 or so).

- **To control verbosity**: Adjust `max_new_tokens`. Setting it to 1200 will allow for very long outputs. If you want shorter outputs, reduce this value.

- **To control repetitiveness**: Use the `repetition_penalty`. A value of 1.3 (the value in the commented-out `generate_kwargs`) should help reduce repetitiveness. Adjust as needed.

It's also a good practice to experiment and fine-tune these hyperparameters based on the specific outputs you're looking for and your subjective judgment of the quality. If possible, gather feedback from users or subject matter experts to refine these settings further.