In [1]:
# !pip install evaluate
# !pip install accelerate
# !pip install bitsandbytes
# !pip install -i https://test.pypi.org/simple/ bitsandbytes

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import os, ipdb
import random, evaluate

import random
import string

# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback, pipeline

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset


# from ../evaluation_metrics import Metrics
seed = 42
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

from evaluation_metrics import Metrics, THRESHOLD

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2024-01-01 00:17:52.042191: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-01 00:17:52.095656: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

# Inference

In [3]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

In [4]:
# 2000/3

In [5]:
script_args.model_name = "meta-llama/Llama-2-7b-hf"
script_args.size = "7b"
script_args.seq_length = 2400

script_args.save_total_limit = 10
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2

script_args.test_ckpt = "checkpoint-76000"
i = 1
script_args.test_dataset = f"../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_{i}/fold1"
script_args.dataset_name = "../data/LLLM_LONG_SUMMARIZED_TDMS_ALL_TEMPLATE/fold1"
script_args.output_dir = f"../model_ckpt/long_summ_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"
script_args.run_name = f"eval_long_summ_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"

script_args.per_device_train_batch_size = 3
script_args.gradient_accumulation_steps = 2
script_args.per_device_eval_batch_size = 2


script_args.save_steps = 1000
script_args.eval_steps = 1000
script_args.evaluation_strategy = 1000
script_args.logging_steps = 1000
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 50

script_args.random_test_sub = 500

script_args.save_strategy = "steps" #"epoch"
script_args.evaluation_strategy= "steps" #"epoch",

In [6]:
model = AutoPeftModelForCausalLM.from_pretrained(
    # script_args.model_name,
    f"{script_args.output_dir}/{script_args.test_ckpt}",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    use_auth_token="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",
    # use_auth_token="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"
)

model.config.use_cache = False


tokenizer = AutoTokenizer.from_pretrained(
    f"{script_args.output_dir}/{script_args.test_ckpt}",
    use_auth_token="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",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

In [8]:
# for i in range(1, 8):
#     script_args.test_dataset = f"./data/LLLM_DOCTEAT_TDMS_DROP_{i}/fold1"
for i in range(1, 2):
    script_args.test_dataset = f"../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_{i}/fold1"    
    
    dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}")
        
    valid_data = dataset["validation"].shuffle(seed=42)

    labels = []
    preds = []

    for i, valid_ex in tqdm(enumerate(valid_data), total=len(valid_data)):
    
        prompt = f"Question: {valid_ex['prompt']}"
        
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

        if inputs.shape[1] >= 8000:
            print(f"Validation index {i} skipped because input.shape: {inputs.shape}, input split length: {len(valid_ex['prompt'].split())}")
            continue 

        print(f"inputs.shape: {inputs.shape}, input split length: {len(valid_ex['prompt'].split())}")
        
        # ipdb.set_trace()
        
        generate_kwargs = dict(
            input_ids=inputs,
            max_new_tokens=250,
            pad_token_id=tokenizer.eos_token_id
        )
        
        # outputs = model.generate(**generate_kwargs)
        # outputs = model.generate(**generate_kwargs)
        # outputs = model.generate(**inputs, max_new_tokens=250, pad_token_id=tokenizer.eos_token_id)
        outputs = model.generate(**inputs)
        
        predictions = tokenizer.decode(outputs[0])
        
        preds.append(predictions.split("Answer: ")[-1].replace("</s>", ""))
        labels.append(valid_ex['answer'])
        
        if i == len(valid_data)//2 :
            results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
            results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))
            print(f"Intermediate Results:")
            for key, value in results.items():
                print(f"{key}: {value}")
        
        if i >= 100:
            break
    
    results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
    results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))
    
    print(f"Test data {script_args.test_dataset}")
    print(f"Test ckpt {script_args.test_ckpt}")
    print(f"Partial THRESHOLD {THRESHOLD}")
    print("##################################################################################")
    print(f"Results:")
    for key, value in results.items():
        print(f"{key}: {value}")
    print("##################################################################################")

  0%|          | 0/1298 [00:00<?, ?it/s]


inputs.shape: torch.Size([1, 2999]), input split length: 1510


TypeError: generate() argument after ** must be a mapping, not Tensor

In [None]:
len(valid_data[7]['prompt'].split())

In [None]:
# # for i in range(1, 8):
# #     script_args.test_dataset = f"./data/LLLM_DOCTEAT_TDMS_DROP_{i}/fold1"
# for i in range(1, 2):
#     script_args.test_dataset = f"../data/LLLM_LONG_SUMMARIZED_TDMS_SQUAD_{i}/fold1"

    
#     dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}")
        
#     # train_data = dataset["train"].shuffle(seed=42)
#     valid_data = dataset["validation"].shuffle(seed=42)

#     labels = []
#     preds = []

#     # # Randomly select 500 indices from valid_data
#     # selected_indices = random.sample(range(len(valid_data)), script_args.random_test_sub)
    
#     # # Create a new list of selected examples
#     # selected_valid_data = [valid_data[i] for i in selected_indices]
    
#     # for i, valid_ex in tqdm(enumerate(selected_valid_data), total=script_args.random_test_sub):
#     for i, valid_ex in tqdm(enumerate(valid_data), total=len(valid_data)):
        
#         prompt = f"Question: {valid_ex['prompt']}"
        
#         inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

#         generate_kwargs = dict(
#             input_ids=inputs,
#             # do_sample=False,
#             # temperature=0.7, 
#             # top_p=0.9, 
#             # top_k=40,
#             max_new_tokens=256,
#             # max_new_tokens=script_args.seq_length+100,
#             pad_token_id=tokenizer.eos_token_id
#             # repetition_penalty=1.2
#         )
        
#         # generate_kwargs = dict(
#         #     input_ids=inputs,
#         #     temperature=0.5, 
#         #     top_p=1.0, 
#         #     # top_p=0.9, 
#         #     top_k=1000,
#         #     max_new_tokens=4000,
#         #     repetition_penalty=1.0
#         # )

#         outputs = model.generate(**generate_kwargs)
#         predictions = tokenizer.decode(outputs[0])

#         pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=script_args.seq_length+100)
        
#         # # result = pipe(f"<s>[INST] {prompt} [/INST]")
#         # result = pipe(f"{prompt}")
#         # # print(result[0]['generated_text'])
#         # predictions = result[0]['generated_text']

#         # ipdb.set_trace()
#         preds.append(predictions.split("Answer: ")[-1].replace("</s>", ""))
#         labels.append(valid_ex['answer'])
        
#         if i == len(valid_data)//2 :
#             results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
#             results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))
#             print(f"Intermediate Results:")
#             for key, value in results.items():
#                 print(f"{key}: {value}")

#         if i >= 10:
#             break
    
#     results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
#     results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))
    
#     print(f"Test data {script_args.test_dataset}")
#     print(f"Test ckpt {script_args.test_ckpt}")
#     print(f"Partial THRESHOLD {THRESHOLD}")
#     print("##################################################################################")
#     print(f"Results:")
#     for key, value in results.items():
#         print(f"{key}: {value}")
#     print("##################################################################################")