In [1]:
# imports
import pandas as pd
import os, ipdb
import random, evaluate

import random
import string

# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback, pipeline

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

2024-02-15 06:39:11.150062: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-15 06:39:11.206584: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# from ../evaluation_metrics import Metrics
seed = 42
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

from evaluation_metrics import Metrics, THRESHOLD

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""
    text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
    # text = f"{example['prompt']}\n{example['answer']}"
    return text
    



In [6]:

script_args.size = "7b"
script_args.seq_length = 2400

script_args.save_total_limit = 10
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2

script_args.test_ckpt = "checkpoint-30000" # "checkpoint-5000"

# script_args.test_dataset = "../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_50_PERCENT/fold1"
# script_args.dataset_name = "../data/LLLM_AUGMENTED_SUMMARIZED_WITH_ID_ZEROSHOT_TDMS_50_PERCENT/fold1"

script_args.test_dataset = "../data/LLLM_AUGMENTED_SUMMARIZED_ZEROSHOT_TDMS_50_PERCENT_LONG/fold1"
script_args.dataset_name = "../data/LLLM_AUGMENTED_SUMMARIZED_ZEROSHOT_TDMS_50_PERCENT_LONG/fold1"

# script_args.test_dataset = "./data/LLLM_AUGMENTED_SUMMARIZED_ZEROSHOT_TDMS_50_PERCENT_DOCTEAT/fold1"
# script_args.dataset_name = "./data/LLLM_AUGMENTED_SUMMARIZED_ZEROSHOT_TDMS_50_PERCENT_DOCTEAT/fold1"


script_args.model_name = "meta-llama/Llama-2-7b-hf"
script_args.output_dir = f"../model_ckpt/augmented_summ_with_id_zeroshot_llama2_{script_args.size}_tdms_f1_50_percent_seq_len_{script_args.seq_length}"
script_args.run_name = f"eval_sft_augmented_summ_with_id_zeroshot_llama2_{script_args.size}_tdms_50_percent_seq_len_{script_args.seq_length}"


# script_args.model_name = "mistralai/Mistral-7B-v0.1"
# script_args.output_dir = f"../model_ckpt/augmented_summ_with_id_zeroshot_mistralai_{script_args.size}_tdms_f1_50_percent_seq_len_{script_args.seq_length}"
# script_args.run_name = f"eval_sft_augmented_summ_with_id_zeroshot_mistralai_{script_args.size}_tdms_50_percent_seq_len_{script_args.seq_length}"



script_args.per_device_train_batch_size = 3
script_args.gradient_accumulation_steps = 2
script_args.per_device_eval_batch_size = 2


script_args.save_steps = 1000
script_args.eval_steps = 1000
script_args.evaluation_strategy = 1000
script_args.logging_steps = 1000
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 50

script_args.random_test_sub = 500

script_args.save_strategy = "steps" #"epoch"
script_args.evaluation_strategy= "steps" #"epoch",

mode = "validation"
# mode = "zeroshot"

In [7]:
model = AutoPeftModelForCausalLM.from_pretrained(
        f"{script_args.output_dir}/{script_args.test_ckpt}",
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
        load_in_4bit=True,
        use_auth_token="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",
    )

model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(
        f"{script_args.output_dir}/{script_args.test_ckpt}",
        use_auth_token="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",
    )
  
dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}")
    
valid_data = dataset[mode].shuffle(seed=42)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
labels = []
preds = []
idx_skip = []
processed_paper = []
# template_selected = "drop_1"
i = 0
# array(['squad_1', 'squad_2', 'squad_3', 'squad_4', 'squad_5', 'squad_6',
#        'squad_7', 'squad_8', 'drop_1', 'drop_2', 'drop_3', 'drop_4',
#        'drop_5', 'drop_6', 'drop_7'], dtype=object)


for idx, valid_ex in tqdm(enumerate(valid_data), total=len(valid_data)):

    if valid_ex['id'] in processed_paper :
        continue
        
    
    prompt = f"Question: {valid_ex['prompt']}"

    if len(prompt.split()) < 2400 :
        continue
    
    inputs = tokenizer.encode(prompt, max_length=5500, truncation=True, return_tensors="pt").to(device)
    # inputs_ = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    # if inputs.shape[1] > 5010:
    #     # ipdb.set_trace()
    #     print(f"Validation index {idx} skipped because input.shape: {inputs.shape}, input split length: {len(valid_ex['prompt'].split())}")
    #     idx_skip.append((inputs.shape[1], len(valid_ex['prompt'].split())))
    #     continue 

    # if len(processed_paper) ==  1582:
    #     break
        
    # if inputs.shape[1] > 5000:
    
    # print(f"inputs.shape: {inputs.shape}, input split length: {len(valid_ex['prompt'].split())}")

    generate_kwargs = dict(
        input_ids=inputs,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )
    
    outputs = model.generate(**generate_kwargs)
    predictions = tokenizer.decode(outputs[0])
    
    preds.append(predictions.split("Answer: ")[-1].replace("</s>", ""))
    labels.append(valid_ex['answer'])

    i+=1
    
    # ipdb.set_trace()
    if i == 10:
        break
    
    processed_paper.append(valid_ex['id'])
    # idx_skip.append((inputs.shape[1], len(valid_ex['prompt'].split())))
        
        # ipdb.set_trace()
        
        # if len(processed_paper)>= 20:
        #     break
            
    # if idx >= 20 :
    #     break
        
        # if idx == len(valid_data)//2 :
        #     results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
        #     results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))
        #     print(f"Intermediate Results:")
        #     for key, value in results.items():
        #         print(f"{key}: {value}")

  0%|          | 0/11725 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.61 GiB. GPU 0 has a total capacty of 23.69 GiB of which 2.66 GiB is free. Including non-PyTorch memory, this process has 21.02 GiB memory in use. Of the allocated memory 14.98 GiB is allocated by PyTorch, and 5.73 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
len(processed_paper)

9

In [7]:
# len_missed = [x[0] for x in idx_skip]
# len_missed[:2]

[]

In [18]:
# np.min(len_missed)

In [None]:
# np.max(len_missed)

In [19]:
# np.mean(len_missed)

In [20]:
# q

In [None]:

Llama 
===============
len(processed_paper)
1479


len_missed = [x[1] for x in idx_skip]
len_missed[:2]
[3999, 3400]
np.min(len_missed)
1928
np.max(len_missed)
25557
np.mean(len_missed)
3784.1600985221676

Mistral 
===============



In [9]:
# from IPython.paths import get_ipython_dir
# import os
# print(os.path.join(get_ipython_dir(), 'profile_default'))

len(preds)
# len(labels)

10

In [17]:
# preds[0]

In [11]:
labels[:2]

['unanswerable',
 "[{'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'Cityscapes test', 'Metric': 'Mean IoU (class)', 'Score': '77.8%'}}, {'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'ADE20K val', 'Metric': 'mIoU', 'Score': '43.68'}}, {'LEADERBOARD': {'Task': 'Semantic Segmentation', 'Dataset': 'ADE20K', 'Metric': 'Validation mIoU', 'Score': '43.68'}}]"]

In [10]:
results = Metrics.evaluate_property_wise_json_based(label_list=labels, prediction_list=preds)
results.update(Metrics.evaluate_rouge(label_list=labels, prediction_list=preds))

print(f"Test data {script_args.test_dataset}")
print(f"Test ckpt {script_args.test_ckpt}")
print(f"Partial THRESHOLD {THRESHOLD}")
print(f"Total index skipped {len(idx_skip)}")
print(f"Index skipped {idx_skip}")

print("##################################################################################")
print(f"Results:")
for key, value in results.items():
    print(f"{key}: {value}")
print("##################################################################################")

Test data ../data/LLLM_AUGMENTED_SUMMARIZED_ZEROSHOT_TDMS_50_PERCENT_LONG/fold1
Test ckpt checkpoint-30000
Partial THRESHOLD 50
Total index skipped 0
Index skipped []
##################################################################################
Results:
general_accuracy: 40.0
exact_recalls_task: 0.0
exact_recalls_dataset: 0.0
exact_recalls_metric: 0.0
exact_recalls_Score: 0.0
exact_recalls_overall: 0.0
partial_recalls_task: 0.0
partial_recalls_dataset: 0.0
partial_recalls_metric: 0.0
partial_recalls_Score: 0.0
partial_recalls_overall: 0.0
exact_precisions_task: 0
exact_precisions_dataset: 0
exact_precisions_metric: 0
exact_precisions_Score: 0
exact_precisions_overall: 0.0
partial_precisions_task: 0
partial_precisions_dataset: 0
partial_precisions_metric: 0
partial_precisions_Score: 0
partial_precisions_overall: 0.0
exact_f1s_task: 0.0
exact_f1s_dataset: 0.0
exact_f1s_metric: 0.0
exact_f1s_Score: 0.0
exact_f1s_overall: 0.0
partial_f1s_task: 0.0
partial_f1s_dataset: 0.0
partial_f1s_