In [1]:
# !pip install evaluate
# !pip install accelerate
# !pip install bitsandbytes
# !pip install -i https://test.pypi.org/simple/ bitsandbytes

# Fetching Datasets

In [14]:
#imports
import pandas as pd
import os, ipdb, re
import random, evaluate

import random
import string

# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset


# from ../evaluation_metrics import Metrics
# Seed every random number generator the notebook relies on so that dataset
# shuffling and any sampling are repeatable across kernel restarts.
seed = 42
random.seed(seed)
np.random.seed(seed)
# torch.manual_seed seeds torch's default generator; previously only the CUDA
# generators were seeded, leaving CPU-side torch randomness unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

from evaluation_metrics import Metrics, THRESHOLD

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

device = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cuda'

# Inference

In [3]:
def _arg(default, help_text):
    """Create a dataclass field with `default` and a help string stored under metadata["help"]."""
    return field(default=default, metadata={"help": help_text})


@dataclass
class ScriptArguments:
    """Settings for this notebook; every field carries a default and a help text in its metadata."""

    # Model and experiment tracking
    model_name: Optional[str] = _arg("meta-llama/Llama-2-7b-hf", "the model name")
    log_with: Optional[str] = _arg("wandb", "use 'wandb' to log with wandb")

    # Data source and preprocessing
    dataset_name: Optional[str] = _arg("lvwerra/stack-exchange-paired", "the dataset name")
    subset: Optional[str] = _arg("data/finetune", "the subset to use")
    split: Optional[str] = _arg("train", "the split to use")
    size_valid_set: Optional[int] = _arg(4000, "the size of the validation set")
    streaming: Optional[bool] = _arg(True, "whether to stream the dataset")
    shuffle_buffer: Optional[int] = _arg(5000, "the shuffle buffer size")
    seq_length: Optional[int] = _arg(1024, "the sequence length")
    num_workers: Optional[int] = _arg(4, "the number of workers")

    # Training loop
    max_steps: Optional[int] = _arg(500, "the maximum number of sgd steps")
    logging_steps: Optional[int] = _arg(10, "the logging frequency")
    save_steps: Optional[int] = _arg(10, "the saving frequency")
    per_device_train_batch_size: Optional[int] = _arg(4, "the per device train batch size")
    per_device_eval_batch_size: Optional[int] = _arg(1, "the per device eval batch size")
    gradient_accumulation_steps: Optional[int] = _arg(2, "the gradient accumulation steps")
    gradient_checkpointing: Optional[bool] = _arg(True, "whether to use gradient checkpointing")
    group_by_length: Optional[bool] = _arg(False, "whether to group by length")
    packing: Optional[bool] = _arg(True, "whether to use packing for SFTTrainer")

    # LoRA
    lora_alpha: Optional[float] = _arg(16, "the lora alpha parameter")
    lora_dropout: Optional[float] = _arg(0.05, "the lora dropout parameter")
    lora_r: Optional[int] = _arg(8, "the lora r parameter")

    # Optimisation
    learning_rate: Optional[float] = _arg(1e-4, "the learning rate")
    lr_scheduler_type: Optional[str] = _arg("cosine", "the lr scheduler type")
    num_warmup_steps: Optional[int] = _arg(100, "the number of warmup steps")
    weight_decay: Optional[float] = _arg(0.05, "the weight decay")
    optimizer_type: Optional[str] = _arg("paged_adamw_32bit", "the optimizer type")

    # Output
    output_dir: Optional[str] = _arg("./results", "the output directory")
    log_freq: Optional[int] = _arg(1, "the logging frequency")


# Build the configuration object from the ScriptArguments defaults. An explicit
# empty argument list is passed, so no command-line arguments are parsed and
# every field starts at its declared default.
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

# Fail fast: this notebook treats packing and length-grouping as mutually exclusive.
if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

In [4]:
# Run configuration: override the ScriptArguments defaults for this evaluation run.
# Several attributes set here (size, test_dataset, test_ckpt, run_name,
# num_train_epochs, save_total_limit) are not declared fields of ScriptArguments;
# they are attached to the instance dynamically.
# The commented-out lines are alternative configurations kept for reference.
# ckpt = "/checkpoint-4350"
script_args.model_name = "meta-llama/Llama-2-13b-hf"
script_args.size = "13b"
# script_args.seq_length = 2400


# script_args.model_name = "meta-llama/Llama-2-7b-hf"
# script_args.size = "7b"

# script_args.seq_length = 512

# docteat_llama2_13b_tdm_f1_all_template_seq_len_1024
# script_args.dataset_name = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
# script_args.output_dir = f"../model_ckpt/docteat_tdm_f2_all_template{ckpt}"
# # script_args.output_dir = f"../model_ckpt{ckpt}"
# script_args.run_name = "sft_llama2_docteat_tdm_f2_all_Template"
# `i` is a notebook-level name interpolated into the test-dataset path below.
i = 1
script_args.test_dataset = f"../data/LLLM_LONG_TDMS_SQUAD_{i}/fold1"
# Name of the checkpoint sub-directory (under output_dir) that is loaded for inference.
script_args.test_ckpt = "best_checkpoint"
# script_args.test_ckpt = "checkpoint-37500"
# script_args.test_ckpt = "checkpoint-35500"

script_args.seq_length = 1024
script_args.dataset_name = "../data/LLLM_DOCTEAT_TDMS_ALL_TEMPLATE/fold1"
# output_dir and run_name are derived from the model size and sequence length set above,
# so they must be assigned after those values.
script_args.output_dir = f"../model_ckpt/docteat_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"
script_args.run_name = f"Evaluate_sft_docteat_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2
# # multi GPU

# script_args.test_ckpt = "checkpoint-1000"
# # script_args.test_ckpt = "checkpoint-14500"
# script_args.seq_length = 2400
# script_args.dataset_name = "../data/LLLM_LONG_TDMS_ALL_TEMPLATE/fold1"
# script_args.output_dir = f"../model_ckpt/long_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"
# script_args.run_name = f"Evaluate_sft_long_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"

# script_args.per_device_train_batch_size = 6
# script_args.gradient_accumulation_steps = 2

# script_args.output_dir = f"../model_ckpt/long_tdms_f1_all_template{ckpt}"
# script_args.run_name = "sft_llama2_long_tdms_f1_all_Template"
# script_args.seq_length = 2400
# script_args.per_device_train_batch_size = 1
# script_args.gradient_accumulation_steps = 1

# Remaining training-style settings; NOTE(review): they look unused by the
# inference cells in this notebook — confirm before relying on them.
script_args.save_steps = 50
script_args.logging_steps = 50
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 10

In [5]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Return the mean number of characters per token over the first samples.

    At most `nb_examples` samples are taken from `dataset`. Each one is rendered
    with `prepare_sample_text`, then its characters and tokens are counted.
    Fast tokenizers are measured through the encoding's `.tokens()`; other
    tokenizers through `tokenizer.tokenize`.
    """
    def count_tokens(text):
        # Pick the counting path that matches the tokenizer implementation.
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    char_count = 0
    token_count = 0
    # Zipping with a range caps the iteration at nb_examples samples.
    first_examples = zip(range(nb_examples), iter(dataset))
    for _position, sample in tqdm(first_examples, total=nb_examples):
        rendered = prepare_sample_text(sample)
        char_count += len(rendered)
        token_count += count_tokens(rendered)

    return char_count / token_count


# def print_trainable_parameters(model):
#     """
#     Prints the number of trainable parameters in the model.
#     """
#     trainable_params = 0
#     all_param = 0
#     for _, param in model.named_parameters():
#         all_param += param.numel()
#         if param.requires_grad:
#             trainable_params += param.numel()
#     print(
#         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
#     )


def prepare_sample_text(example):
    """Render one dataset record as a "Question: ... Answer: ..." string."""
    question = example['prompt']
    reply = example['answer']
    # Alternative bare layout (unused): f"{example['prompt']}\n{example['answer']}"
    return f"Question: {question}\n\nAnswer: {reply}"

# def compute_metrics(eval_preds):
    
#     preds, labels = eval_preds
#     # ipdb.set_trace()
#     if isinstance(preds, tuple):
#         preds = preds[0]

#     ipdb.set_trace()
#     preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     results = Metrics.evaluate_property_wise_json_based(label_list=decoded_labels, prediction_list=decoded_preds)

#     # ipdb.set_trace()

#     # print(results)
    
#     results.update(Metrics.evaluate_rouge(label_list=decoded_labels, prediction_list=decoded_preds))

#     # print(results)
    
    
#     return results
    
def create_datasets(tokenizer, args):
    """
    Load the on-disk dataset at `args.test_dataset` and wrap its splits for packing.

    The DatasetDict is shuffled with the notebook-level `seed`. Its "train" and
    "validation" splits are then wrapped in ConstantLengthDataset objects of
    `args.seq_length` tokens, using a characters-per-token ratio estimated on
    the train split. The train wrapper is infinite; the validation wrapper is not.

    Returns:
        A `(train_dataset, valid_dataset)` tuple.
    """
    splits = DatasetDict.load_from_disk(f"{args.test_dataset}").shuffle(seed=seed)
    train_data, valid_data = splits["train"], splits["validation"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=400)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    # Everything except the data split and the `infinite` flag is shared.
    shared_kwargs = dict(
        formatting_func=prepare_sample_text,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, **shared_kwargs)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, **shared_kwargs)
    return train_dataset, valid_dataset


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

In [6]:
script_args.output_dir

'../model_ckpt/docteat_llama2_13b_tdms_f1_all_template_seq_len_1024'

In [7]:
f"{script_args.output_dir}/{script_args.test_ckpt}"

'../model_ckpt/docteat_llama2_13b_tdms_f1_all_template_seq_len_1024/best_checkpoint'

In [8]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# base_model = AutoModelForCausalLM.from_pretrained(
#     script_args.model_name,
#     quantization_config=bnb_config,
#     device_map={"": 0},
#     trust_remote_code=True,
#     token=os.environ.get("HF_TOKEN"),  # read the access token from the environment; never commit it
#     # use_auth_token=True,
# )

# base_model.config.use_cache = False

In [9]:
# Load the fine-tuned PEFT checkpoint (output_dir/test_ckpt) in 4-bit with
# bfloat16 as the torch dtype.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being hardcoded, so the credential is never stored in the notebook.
# When the variable is unset, None is passed and the library falls back to its
# default authentication (NOTE(review): confirm a cached login exists if needed).
model = AutoPeftModelForCausalLM.from_pretrained(
    f"{script_args.output_dir}/{script_args.test_ckpt}",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# # model = model.cpu()
# model = model.to("cpu")

# # torch.cuda.empty_cache()
# # # model.generate()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# script_args.test_dataset = f"../data/LLLM_LONG_TDMS_SQUAD_{i}/fold1"
script_args.test_dataset = f"../data/LLLM_DOCTEAT_TDMS_SQUAD_{i}/fold1"

In [11]:
# Load the evaluation dataset from disk and shuffle each split reproducibly.
dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}")

# Reuse the notebook-level `seed` rather than repeating the literal 42, so the
# shuffle order follows the single seed defined alongside the imports.
train_data = dataset["train"].shuffle(seed=seed)
valid_data = dataset["validation"].shuffle(seed=seed)
# Optional: evaluate on a tenth of the validation split only.
# valid_data = dataset["validation"].shard(num_shards=10, index=0).shuffle(seed=seed)

len(valid_data)

2353

In [20]:
sample_data = valid_data[10]
# sample_data = train_data[100]
sample_data

{'prompt': '007: Democratically Finding The Cause of Packet Drops (Extended Version) Network failures continue to plague datacenter operators as their symptoms may not have direct correlation with where or why they occur. We introduce 007, a lightweight, always-on diagnosis application that can find problematic links and also pinpoint problems for each TCP connection. 007 is completely contained within the end host. During its two month deployment in a tier-1 datacenter, it detected every problem found by previously deployed monitoring tools while also finding the sources of other problems previously undetected. We next evaluate 007 on the more realistic environment of a test cluster with 10 ToRs and a total of 80 links\n\nPlease answer a question about this article. If the question is unanswerable, say "unanswerable". What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?',
 'answer': "[{'

In [21]:
print(re.search(f"Cross-Modal Retrieval", sample_data['prompt']))

from fuzzywuzzy import fuzz
similarity = fuzz.ratio("Data-to-Text Generation", "Text Generation")
similarity

None


79

In [22]:
# df_valid_data = valid_data.to_pandas()
# df_valid_data.tail()

In [23]:
# str(df_valid_data.at[35275, 'prompt'])

In [24]:
# str(df_valid_data.at[35275, 'prompt'])
script_args.seq_length

1024

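When generating text with a causal language model (this notebook calls Hugging Face `model.generate` on a fine-tuned Llama-2), several decoding hyperparameters can be tweaked to influence the output. Note that `temperature`, `top_p`, and `top_k` only take effect when sampling is enabled (`do_sample=True`); with greedy decoding they are ignored. Here's a breakdown of the parameters used in the cells below: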

1. **`temperature`**: 
    - **Purpose**: Adjusts the randomness of the model's outputs.
    - **Values**:
        - Closer to 0: The model will be more deterministic and more likely to produce the most probable next word at each step.
        - Closer to 1: The model's outputs become more random.
    - **`temperature=0.2`**: In this case, the model's outputs will be more deterministic and confident. There's less randomness.

2. **`top_p` (also known as "nucleus sampling")**:
    - **Purpose**: Prunes the vocabulary before sampling the next word.
    - **Values**:
        - At `top_p=1.0`: Use all words in the vocabulary.
        - At `top_p=0.95`: Use only the smallest set of words such that their cumulative probability exceeds 0.95.
    - **`top_p=0.95`**: The model will only consider the top words that have a cumulative probability of 95%. It helps in reducing the chance of very random words appearing in the generated text.

3. **`top_k`**:
    - **Purpose**: Restricts the model's prediction to the top `k` most likely next words.
    - **Values**: Larger values make outputs more random, while smaller values make it less random.
    - **`top_k=40`**: The model will only consider the top 40 words for its next word prediction. It's another way to reduce randomness, but it's typically used in conjunction with `top_p` for better results.

4. **`max_new_tokens`**:
    - **Purpose**: Limits the length of the generated output.
    - **`max_new_tokens=script_args.seq_length`**: At most `script_args.seq_length` new tokens are generated; the prompt tokens do not count towards this limit. It ensures that the model doesn't generate exceedingly long responses.

5. **`repetition_penalty`**:
    - **Purpose**: Penalizes words that are already seen in the output, especially if repeated multiple times.
    - **Values**:
        - Equal to 1: No penalty applied.
        - Greater than 1: Apply a penalty.
    - **`repetition_penalty=1.0`**: In this case, no penalty is applied for repeated words.

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is loaded from the same checkpoint directory as the model.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being hardcoded, so the credential is never stored in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    f"{script_args.output_dir}/{script_args.test_ckpt}",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Only the "Question: ..." part is fed to the model; the answer is extracted
# below from whatever follows the last "Answer: " marker in the decoded text.
inputs = tokenizer.encode(f"Question: {sample_data['prompt']}", return_tensors="pt").to(device)

# NOTE(review): temperature / top_p / top_k only influence decoding when
# sampling is enabled (do_sample=True here or in the model's generation
# config) — confirm which decoding mode is intended for evaluation.
generate_kwargs = dict(
    input_ids=inputs,
    temperature=0.5,
    top_p=1.0,
    top_k=1000,
    max_new_tokens=4000,
    repetition_penalty=1.0,
)

outputs = model.generate(**generate_kwargs)
predictions = tokenizer.decode(outputs[0])
print(f"Label: \n{sample_data['answer']}\n")
print("Prediction")
# Keep only the text after the last "Answer: " marker and strip the "</s>" tag.
print(predictions.split("Answer: ")[-1].replace("</s>", ""))

Label: 
[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'MSMT17', 'Metric': 'Rank-1', 'Score': '86.2'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'MSMT17', 'Metric': 'Rank-10', 'Score': 'Paper Title:An Effective Data Augmentation for Person Re-identification.--github:https://github.com/finger-monkey/Data-Augmentation'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'MSMT17', 'Metric': 'Rank-5', 'Score': 'Paper title:A Person Re-identification Data Augmentation Method with Adversarial Defense Effect--github:https://github.com/finger-monkey/ReID_Adversarial_Defense'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'MSMT17', 'Metric': 'mAP', 'Score': '65.9'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Mini-Imagenet 5-way (5-shot)', 'Metric': 'Accuracy', 'Score': '87.4'}}, {'LEADERBOARD': {'Task': 'Few-Shot Image Classification', 'Dataset': 'Mini-Imagenet 5-way (5-shot)', 'Metric': 'Acc

In [67]:
# predictions

# Compute metrics

In [46]:
# batch_decode

In [47]:
# # inputs = tokenizer.batch_encode(f"Question: {valid_ex['prompt']}", return_tensors="pt").to(device)
# inputs = tokenizer.batch_encode(eval_dataset, return_tensors="pt").to(device)

# # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
# # generate_kwargs = dict(
# #     input_ids=inputs,
# #     temperature=0.2, 
# #     top_p=0.90, 
# #     top_k=40,
# #     max_new_tokens=script_args.seq_length,
# #     repetition_penalty=1.1
# # )

# generate_kwargs = dict(
#     input_ids=inputs,
#     temperature=0.2, 
#     top_p=0.95, 
#     top_k=40,
#     max_new_tokens=500,
#     repetition_penalty=1.3
# )

# outputs = model.generate(**generate_kwargs)
# # predictions = tokenizer.batch_decode(outputs[0])

In [48]:
len(valid_data)

2353

In [None]:
# Run generation over every validation example, collecting the gold answers
# and the model predictions in parallel lists for the metric cells below.
labels = []
preds = []

# Decoding settings are the same for every example, so build them once
# instead of on each iteration.
# NOTE(review): temperature / top_p / top_k only influence decoding when
# sampling is enabled (do_sample=True here or in the model's generation
# config) — confirm which decoding mode is intended for evaluation.
generation_settings = dict(
    temperature=0.5,
    top_p=1.0,
    top_k=1000,
    max_new_tokens=4000,
    repetition_penalty=1.0,
)

# Iterate without an index: the previous loop bound an unused counter to `i`,
# which overwrote the notebook-level `i` used to build the dataset paths.
for valid_ex in tqdm(valid_data, total=len(valid_data)):
    inputs = tokenizer.encode(f"Question: {valid_ex['prompt']}", return_tensors="pt").to(device)

    outputs = model.generate(input_ids=inputs, **generation_settings)
    predictions = tokenizer.decode(outputs[0])

    # Keep only the text after the last "Answer: " marker and strip the "</s>" tag.
    preds.append(predictions.split("Answer: ")[-1].replace("</s>", ""))
    labels.append(valid_ex['answer'])
    
    # if i >= 1000:
    #     break
    
    # ipdb.set_trace()

 33%|███▎      | 786/2353 [18:20:22<22:47:18, 52.35s/it]  

In [None]:
# labels_ = labels[:200]
# preds_ = preds[:200]

labels_ = labels
preds_ = preds

In [None]:
len(labels_)

In [None]:
labels_[10]

In [None]:
len(preds_)

In [None]:
preds_[10]

In [None]:
# Score the predictions: property-wise JSON metrics first, then the ROUGE
# scores are merged into the same results mapping.
results = Metrics.evaluate_property_wise_json_based(label_list=labels_, prediction_list=preds_)
rouge_scores = Metrics.evaluate_rouge(label_list=labels_, prediction_list=preds_)
results.update(rouge_scores)

print("Results:")
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

In [None]:
# `item_list` is never defined in this notebook, so evaluating it raises
# NameError on a fresh kernel; left disabled until the defining cell is restored.
# item_list

In [None]:
labels[0]

In [None]:
preds[0]

In [None]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
#                     references=[1 if "unanswerable" == x else 0 for x in labels]
#                     # references=[1 for df['answer'].tolist()]
# )


In [None]:
# rouge = evaluate.load('rouge')

# results = rouge.compute(
#     predictions=[pred.replace("</s>", "") for pred in preds],
#     # predictions=preds,
#     references=labels
# )
# results

In [None]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
#                     references=[1 if "unanswerable" == x else 0 for x in labels]
#                     # references=[1 for df['answer'].tolist()]
# )


In [None]:
# rouge = evaluate.load('rouge')

# results = rouge.compute(
#     predictions=[pred.replace("</s>", "") for pred in preds],
#     # predictions=preds,
#     references=labels
# )
# results

In [None]:
# results

In [None]:
# i = 0
# for x in tqdm(preds):
#     if "unanswerable" in x:
#         ipdb.set_trace()
#     else:
#         i+=1
        
#         # print(x)