In [1]:
# !pip install evaluate
# !pip install accelerate
# !pip install bitsandbytes
# !pip install -i https://test.pypi.org/simple/ bitsandbytes

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import os, ipdb
import random, evaluate

import random
import string

# Fine-Tune Llama2-7b on custom dataset
import os, ipdb
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import torch, random
from datasets import DatasetDict, Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer\
, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, TrainerCallback, pipeline

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset


# from ../evaluation_metrics import Metrics
# Seed every random number generator used in this notebook.
seed = 42
random.seed(seed)
np.random.seed(seed)
# torch.manual_seed seeds the CPU generator (and the CUDA ones); the previous
# code only called torch.cuda.manual_seed_all, leaving CPU-side torch
# randomness unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

from evaluation_metrics import Metrics, THRESHOLD

# Pin the tokenizers parallelism flag ("true" is the alternative, depending on your needs).
os.environ.update(TOKENIZERS_PARALLELISM="false")

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

2023-12-10 23:59:07.132044: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-10 23:59:08.202173: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

# Inference

In [3]:
def _arg(default, help_text):
    """Build a dataclass field with the given default and a ``help`` metadata entry."""
    return field(default=default, metadata={"help": help_text})


@dataclass
class ScriptArguments:
    """Default configuration values for this notebook.

    Every field stores its description under the ``help`` key of the field
    metadata. Field order is significant for positional construction and is
    kept as originally declared.
    """

    # Model and experiment tracking
    model_name: Optional[str] = _arg("meta-llama/Llama-2-7b-hf", "the model name")
    log_with: Optional[str] = _arg("wandb", "use 'wandb' to log with wandb")

    # Dataset selection and loading
    dataset_name: Optional[str] = _arg("lvwerra/stack-exchange-paired", "the dataset name")
    subset: Optional[str] = _arg("data/finetune", "the subset to use")
    split: Optional[str] = _arg("train", "the split to use")
    size_valid_set: Optional[int] = _arg(4000, "the size of the validation set")
    streaming: Optional[bool] = _arg(True, "whether to stream the dataset")
    shuffle_buffer: Optional[int] = _arg(5000, "the shuffle buffer size")
    seq_length: Optional[int] = _arg(1024, "the sequence length")
    num_workers: Optional[int] = _arg(4, "the number of workers")

    # Training loop
    max_steps: Optional[int] = _arg(500, "the maximum number of sgd steps")
    logging_steps: Optional[int] = _arg(10, "the logging frequency")
    save_steps: Optional[int] = _arg(10, "the saving frequency")
    per_device_train_batch_size: Optional[int] = _arg(4, "the per device train batch size")
    per_device_eval_batch_size: Optional[int] = _arg(1, "the per device eval batch size")
    gradient_accumulation_steps: Optional[int] = _arg(2, "the gradient accumulation steps")
    gradient_checkpointing: Optional[bool] = _arg(True, "whether to use gradient checkpointing")
    group_by_length: Optional[bool] = _arg(False, "whether to group by length")
    packing: Optional[bool] = _arg(True, "whether to use packing for SFTTrainer")

    # LoRA adapter
    lora_alpha: Optional[float] = _arg(16, "the lora alpha parameter")
    lora_dropout: Optional[float] = _arg(0.05, "the lora dropout parameter")
    lora_r: Optional[int] = _arg(8, "the lora r parameter")

    # Optimisation
    learning_rate: Optional[float] = _arg(1e-4, "the learning rate")
    lr_scheduler_type: Optional[str] = _arg("cosine", "the lr scheduler type")
    num_warmup_steps: Optional[int] = _arg(100, "the number of warmup steps")
    weight_decay: Optional[float] = _arg(0.05, "the weight decay")
    optimizer_type: Optional[str] = _arg("paged_adamw_32bit", "the optimizer type")

    # Output
    output_dir: Optional[str] = _arg("./results", "the output directory")
    log_freq: Optional[int] = _arg(1, "the logging frequency")

# Parse an explicit empty argument list (not the process arguments), so every
# ScriptArguments field starts at its declared default.
parser = HfArgumentParser(ScriptArguments)
parsed_dataclasses = parser.parse_args_into_dataclasses([])
script_args = parsed_dataclasses[0]

# Packing and length-grouped batching cannot be combined.
packing_conflict = script_args.group_by_length and script_args.packing
if packing_conflict:
    raise ValueError("Cannot use both packing and group by length")

In [4]:
# 2000/3

In [3]:
# ---------------------------------------------------------------------------
# Run configuration: overrides applied on top of the ScriptArguments defaults.
#
# Cleaned up relative to the previous version of this cell:
#   * `script_args.seq_length = 2400` was assigned and then immediately
#     overwritten with 240 -> the dead assignment is removed.
#   * `script_args.evaluation_strategy = 5000` stored a step count in a
#     strategy field and was overwritten with "steps" further down -> removed.
# The final value of every attribute is the same as before.
# ---------------------------------------------------------------------------

# Base model and the size tag interpolated into the checkpoint directory name.
# Alternatives tried previously: "meta-llama/Llama-2-13b-hf" (size "13b"),
# "mistralai/Mistral-7B-v0.1".
script_args.model_name = "meta-llama/Llama-2-7b-hf"
script_args.size = "7b"

# Sequence length; also interpolated into output_dir / run_name below.
# NOTE(review): earlier variants of this cell used 1024 and 2400, and saved
# outputs further down mention seq_len_3000 - confirm that 240 is intended.
script_args.seq_length = 240

# Suffix of the test-set directory name.
i = 1

# Data, checkpoint and output locations ("long" TDMS setup, fold 1).
# The DOCTEAT variants live under ../data/LLLM_DOCTEAT_TDMS_* with matching
# ../model_ckpt/docteat_llama2_* output directories.
script_args.test_dataset = f"../data/LLLM_LONG_TDMS_SQUAD_{i}/fold1"
script_args.test_ckpt = "checkpoint-3000"
script_args.dataset_name = "../data/LLLM_LONG_TDMS_ALL_TEMPLATE/fold1"
script_args.output_dir = f"../model_ckpt/long_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"
script_args.run_name = f"Evaluate_sft_long_llama2_{script_args.size}_tdms_f1_all_template_seq_len_{script_args.seq_length}"

# Batch sizing.
script_args.per_device_train_batch_size = 6
script_args.gradient_accumulation_steps = 2

# Step-based saving / evaluation / logging cadence.
script_args.save_steps = 5000
script_args.eval_steps = 5000
script_args.logging_steps = 5000
script_args.save_strategy = "steps"  # or "epoch"
script_args.evaluation_strategy = "steps"  # or "epoch"

script_args.streaming = False
script_args.num_train_epochs = 15
script_args.save_total_limit = 50

In [4]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.

    At most ``nb_examples`` samples are read from ``dataset``; each one is
    formatted with ``prepare_sample_text`` before being measured.

    Args:
        dataset: iterable of samples accepted by ``prepare_sample_text``.
        tokenizer: tokenizer exposing ``is_fast`` and either a callable
            interface whose result has ``tokens()`` (fast) or ``tokenize``.
        nb_examples: upper bound on the number of samples inspected.

    Returns:
        Total character count divided by total token count.

    Raises:
        ZeroDivisionError: if no tokens were counted (e.g. empty dataset).
    """

    def _token_count(text):
        # Fast tokenizers report tokens on the encoding object; others use tokenize().
        if tokenizer.is_fast:
            return len(tokenizer(text).tokens())
        return len(tokenizer.tokenize(text))

    char_total = 0
    token_total = 0
    # Zipping with range() stops after nb_examples without pulling an extra sample.
    sample_stream = zip(range(nb_examples), iter(dataset))
    for _, sample in tqdm(sample_stream, total=nb_examples):
        formatted = prepare_sample_text(sample)
        char_total += len(formatted)
        token_total += _token_count(formatted)

    return char_total / token_total


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, param)``
            pairs, each ``param`` providing ``numel()`` and ``requires_grad``.

    Raises:
        ZeroDivisionError: if the model has no parameters at all.
    """
    # Single pass: record (element count, trainable flag) per parameter.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )


def prepare_sample_text(example):
    """Format one sample as ``Question: <prompt>`` / ``Answer: <answer>`` text.

    Args:
        example: mapping with ``'prompt'`` and ``'answer'`` entries.

    Returns:
        The two parts joined by a blank line.
    """
    question, answer = example["prompt"], example["answer"]
    # Alternative template without labels: "{prompt}\n{answer}"
    return "Question: {}\n\nAnswer: {}".format(question, answer)

def compute_metrics(eval_preds):
    """
    Decode predictions and labels and score them with the project metrics.

    Changes from the previous version: the leftover ``ipdb.set_trace()``
    breakpoint (which halts any evaluation run) is removed, together with the
    unused ``generate_kwargs`` / ``predictions`` locals and a duplicated
    ``np.where`` + decode of the predictions.

    Relies on the module-level ``tokenizer`` and on ``Metrics``.

    Args:
        eval_preds: pair ``(preds, labels)``. If ``preds`` is a tuple, only
            its first element is used.
            NOTE(review): both are assumed to be arrays of token ids that
            ``tokenizer.batch_decode`` accepts - confirm preds are not logits.

    Returns:
        The dict from ``Metrics.evaluate_property_wise_json_based`` updated
        with the entries from ``Metrics.evaluate_rouge``.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 with the pad token id, as -100 cannot be decoded.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    results = Metrics.evaluate_property_wise_json_based(label_list=decoded_labels, prediction_list=decoded_preds)
    results.update(Metrics.evaluate_rouge(label_list=decoded_labels, prediction_list=decoded_preds))

    return results
    
def create_datasets(tokenizer, args):
    """
    Load the on-disk DatasetDict at ``args.test_dataset`` and wrap two of its
    splits as ConstantLengthDataset objects.

    The whole DatasetDict is shuffled with the module-level ``seed``. The
    "train" split is used in full; "validation" keeps shard 0 of 20 and is
    shuffled again with seed 42. The characters-per-token ratio is estimated
    on up to 400 training samples and passed to both datasets.

    Args:
        tokenizer: tokenizer used for the ratio estimate and by both datasets.
        args: object providing ``test_dataset`` and ``seq_length``.

    Returns:
        ``(train_dataset, valid_dataset)``; the training dataset is created
        with ``infinite=True`` and the validation one with ``infinite=False``.
    """
    splits = DatasetDict.load_from_disk(f"{args.test_dataset}").shuffle(seed=seed)

    train_data = splits["train"]
    valid_data = splits["validation"].shard(num_shards=20, index=0).shuffle(seed=42)
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, nb_examples=400)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    # Settings shared by both splits; only the data and `infinite` differ.
    shared_kwargs = dict(
        formatting_func=prepare_sample_text,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, **shared_kwargs)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, **shared_kwargs)
    return train_dataset, valid_dataset
    

In [5]:
script_args.output_dir

'../model_ckpt/long_llama2_7b_tdms_f1_all_template_seq_len_3000'

In [6]:
f"{script_args.output_dir}/{script_args.test_ckpt}"

'../model_ckpt/long_llama2_7b_tdms_f1_all_template_seq_len_3000/checkpoint-3000'

In [7]:
# 4-bit NF4 quantisation settings. Two configurations are defined; they differ
# only in the compute dtype: bfloat16 (bnb_config) and float16 (quant_config).
_nf4_common = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
)

bnb_config = BitsAndBytesConfig(bnb_4bit_compute_dtype=torch.bfloat16, **_nf4_common)

compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(bnb_4bit_compute_dtype=compute_dtype, **_nf4_common)

# Disabled: loading the plain base model with bnb_config.
# (The access token previously pasted here has been redacted - never commit tokens.)
# base_model = AutoModelForCausalLM.from_pretrained(
#     script_args.model_name,
#     quantization_config=bnb_config,
#     device_map={"": 0},
#     trust_remote_code=True,
#     use_auth_token=os.environ["HF_TOKEN"],
#     # use_auth_token=True,
# )

# base_model.config.use_cache = False

In [8]:
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=quant_config,
#     device_map={"": 0}
# )
# model.config.use_cache = False
# model.config.pretraining_tp = 1

In [9]:
f"{script_args.output_dir}/save_pretrained"

'../model_ckpt/long_llama2_7b_tdms_f1_all_template_seq_len_3000/save_pretrained'

In [10]:
# model = AutoPeftModelForCausalLM.from_pretrained(f"{script_args.output_dir}/{script_args.test_ckpt}", device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

In [11]:
# base_model = AutoModelForCausalLM.from_pretrained(
#     # f"{script_args.output_dir}/{script_args.test_ckpt}",
#     script_args.model_name,
#     quantization_config=bnb_config,
#     # quantization_config=quant_config,
#     device_map={"": 0},
#     trust_remote_code=True,
#     use_auth_token=os.environ["HF_TOKEN"],  # credential redacted: read the token from the environment
#     # use_auth_token=True,
# )


# # base_model.config.use_cache = False
# # base_model.config.pretraining_tp = 1

# base_model = AutoPeftModelForCausalLM.from_pretrained(
#     # script_args.model_name,
#     f"{script_args.output_dir}/{script_args.test_ckpt}",
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
#     load_in_4bit=True,
#     use_auth_token=os.environ["HF_TOKEN"],  # credentials redacted: read the token from the environment
# )

# tokenizer = AutoTokenizer.from_pretrained(
#     f"{script_args.output_dir}/{script_args.test_ckpt}",
#     use_auth_token=os.environ["HF_TOKEN"],  # credential redacted: read the token from the environment
# )

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

In [12]:
script_args.lora_r

8

In [13]:
# # peft_config = LoraConfig(
# #     r=script_args.lora_r,
# #     lora_alpha=script_args.lora_alpha,
# #     lora_dropout=script_args.lora_dropout,
# #     target_modules=["q_proj", "v_proj"],
# #     bias="none",
# #     task_type="CAUSAL_LM",
# # )

# peft_args = LoraConfig(
#     lora_alpha=16,
#     lora_dropout=0.1,
#     r=64,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# # tokenizer = AutoTokenizer.from_pretrained(
# #     script_args.model_name, 
# #     use_auth_token=os.environ["HF_TOKEN"],  # credential redacted: read the token from the environment
# #     trust_remote_code=True
# # )

# # tokenizer.pad_token = tokenizer.eos_token
# # tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# # tokenizer.add_tokens(AddedToken("{", normalized=False))
# # tokenizer.add_tokens(AddedToken("}", normalized=False))


# # # https://github.com/tatsu-lab/stanford_alpaca/issues/133#issuecomment-1483893538
# # training_args = TrainingArguments(
# #     output_dir=script_args.output_dir,
# #     per_device_train_batch_size=script_args.per_device_train_batch_size,
# #     gradient_accumulation_steps=script_args.gradient_accumulation_steps,
# #     per_device_eval_batch_size=script_args.per_device_eval_batch_size,
# #     learning_rate=script_args.learning_rate,
# #     logging_steps=script_args.logging_steps,
# #     # max_steps=script_args.max_steps,
# #     # report_to=script_args.log_with,
# #     save_steps=script_args.save_steps,
# #     evaluation_strategy=script_args.evaluation_strategy,
# #     save_strategy=script_args.save_strategy,
# #     eval_steps = script_args.eval_steps,
# #     load_best_model_at_end=True,
# #     save_total_limit=script_args.save_total_limit,
# #     group_by_length=script_args.group_by_length,
# #     lr_scheduler_type=script_args.lr_scheduler_type,
# #     warmup_steps=script_args.num_warmup_steps,
# #     optim=script_args.optimizer_type,
# #     # bf16=True,
# #     fp16=True,
# #     remove_unused_columns=False,
# #     num_train_epochs = script_args.num_train_epochs,
# #     run_name=script_args.run_name,
# # )

# # Set training parameters
# training_params = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=1,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=1,
#     optim="paged_adamw_32bit",
#     save_steps=25,
#     logging_steps=25,
#     learning_rate=2e-4,
#     weight_decay=0.001,
#     fp16=False,
#     bf16=False,
#     max_grad_norm=0.3,
#     max_steps=-1,
#     warmup_ratio=0.03,
#     group_by_length=True,
#     # lr_scheduler_type="constant",
#     # report_to="tensorboard"
# )


# print(torch.__version__)

# train_dataset, eval_dataset = create_datasets(tokenizer, script_args)

# len(train_dataset)

# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs available: {num_gpus}")


# # expected_steps = ((len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)) * training_args.num_train_epochs)// num_gpus
# # # expected_steps = (len(train_dataset) // (training_args.per_device_train_batch_size)) * training_args.num_train_epochs
# # print(f"Expected steps: {expected_steps}")

# # print(f"Max token lenght: {tokenizer.model_max_length}")
# # print(f"Test Batch size: {script_args.per_device_eval_batch_size * script_args.gradient_accumulation_steps * num_gpus }")
# # print(f"Number of GPUs available: {num_gpus}")

# # # print(script_args)|
# # trainer = SFTTrainer(
# #     model=base_model,
# #     train_dataset=train_dataset,
# #     eval_dataset=eval_dataset,
# #     peft_config=peft_config,
# #     packing=script_args.packing,
# #     # max_seq_length=None,
# #     # max_seq_length=script_args.seq_length,
# #     tokenizer=tokenizer,
# #     args=training_args,
# #     compute_metrics=compute_metrics,
# # )

# trainer = SFTTrainer(
#     model=base_model,
#     train_dataset=train_dataset,
#     peft_config=peft_args,
#     dataset_text_field="text",
#     max_seq_length=None,
#     tokenizer=tokenizer,
#     args=training_params,
#     packing=False,
# )

# # trainer.evaluate()

# # Train model
# # trainer.train()


In [14]:
f"{script_args.output_dir}/{script_args.test_ckpt}"

'../model_ckpt/long_llama2_7b_tdms_f1_all_template_seq_len_3000/checkpoint-3000'

In [15]:
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# instead of being hardcoded in the notebook; when the variable is unset, True
# asks the library to use the token stored by `huggingface-cli login`.
# The tokens that were pasted here before must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN", True)

# Directory of the fine-tuned adapter checkpoint to evaluate.
checkpoint_dir = f"{script_args.output_dir}/{script_args.test_ckpt}"

model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,
    use_auth_token=hf_token,
)

# model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_dir,
    use_auth_token=hf_token,
)

# model = model.cpu()
# model = model.to("cpu")

# torch.cuda.empty_cache()
# # model.generate()

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# model.config.use_cache = True

In [17]:
# Load the DatasetDict stored at script_args.test_dataset and shuffle the two
# splits used below with a fixed seed.
dataset = DatasetDict.load_from_disk(str(script_args.test_dataset))
train_data, valid_data = (
    dataset[split_name].shuffle(seed=42) for split_name in ("train", "validation")
)
# Alternative: keep one tenth of validation via .shard(num_shards=10, index=0).

# Cell output: number of validation records.
len(valid_data)

2353

In [19]:
# Pick a random validation example to inspect.
# random.randint includes its upper bound, so randint(0, len(valid_data)) could
# return len(valid_data) and make the indexing below raise IndexError;
# randrange excludes the upper bound.
idx = random.randrange(len(valid_data))
example = valid_data[idx]

print(f"Index: {idx}\n")

print(f"Question: {example['prompt']}")
print("\n#################################################\n")
print(f"Answer: {example['answer']}")

Index: 102

Question: Title	From Big to Small: Multi-Scale Local Planar Guidance
for Monocular Depth Estimation

Abstract:	Estimating accurate depth from a single image is challenging because it is an ill-posed problem as infinitely many 3D scenes can be projected to the same 2D scene. However, recent works based on deep convolutional neural networks show great progress with plausible results. The convolutional neural networks are generally composed of two parts: an encoder for dense feature extraction and a decoder for predicting the desired depth. In the encoder-decoder schemes, repeated strided convolution and spatial pooling layers lower the spatial resolution of transitional outputs, and several techniques such as skip connections or multi-layer deconvolutional networks are adopted to recover back to the original resolution for effective dense prediction.

In this paper, for more effective guidance of densely encoded features to the desired depth prediction, we propose a network a

In [20]:
# prompt = "Who is Leonardo Da Vinci?"
# Prompt uses the same "Question: " prefix as prepare_sample_text.
prompt = "Question: {}".format(valid_data[idx]["prompt"])

# Text-generation pipeline around the loaded model; max_length comes from seq_length.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=script_args.seq_length,
)

# Alternative prompt wrapper tried earlier: f"<s>[INST] {prompt} [/INST]"
result = pipe(prompt)
print(result[0]["generated_text"])

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'Peg

OutOfMemoryError: CUDA out of memory. Tried to allocate 7.59 GiB. GPU 0 has a total capacty of 23.69 GiB of which 7.24 GiB is free. Including non-PyTorch memory, this process has 16.45 GiB memory in use. Of the allocated memory 15.88 GiB is allocated by PyTorch, and 263.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [27]:
# Rebuild the prompt for the current idx and run it through the existing pipeline.
prompt = "Question: {}".format(valid_data[idx]["prompt"])

result = pipe(prompt)
print(result[0]["generated_text"])

Question: From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation Estimating accurate depth from a single image is challenging because it is an ill-posed problem as infinitely many 3D scenes can be projected to the same 2D scene. However, recent works based on deep convolutional neural networks show great progress with plausible results. The convolutional neural networks are generally composed of two parts: an encoder for dense feature extraction and a decoder for predicting the desired depth. In the encoderdecoder schemes, repeated strided convolution and spatial pooling layers lower the spatial resolution of transitional outputs, and several techniques such as skip connections or multi-layer deconvolutional networks are adopted to recover back to the original resolution for effective dense prediction.In this paper, for more effective guidance of densely encoded features to the desired depth prediction, we propose a network architecture that utilizes novel 

In [28]:
# Reference answer stored for the example at idx.
print("Answer: {}".format(valid_data[idx]["answer"]))

Answer: [{'LEADERBOARD': {'Task': 'Depth Estimation', 'Dataset': 'NYU-Depth V2', 'Metric': 'RMS', 'Score': '0.407'}}, {'LEADERBOARD': {'Task': 'Monocular Depth Estimation', 'Dataset': 'KITTI Eigen split', 'Metric': 'absolute relative error', 'Score': '0.064'}}, {'LEADERBOARD': {'Task': 'Monocular Depth Estimation', 'Dataset': 'NYU-Depth V2', 'Metric': 'RMSE', 'Score': '0.392'}}]


In [14]:
# Reload the DatasetDict from script_args.test_dataset and reshuffle the train
# and validation splits with seed 42.
dataset = DatasetDict.load_from_disk(str(script_args.test_dataset))
shuffled_splits = {name: dataset[name].shuffle(seed=42) for name in ("train", "validation")}
train_data = shuffled_splits["train"]
valid_data = shuffled_splits["validation"]
# Alternative: keep one tenth of validation via .shard(num_shards=10, index=0).

# Cell output: number of validation records.
len(valid_data)

2353

In [15]:
# valid_data[7]

In [16]:
len(valid_data[97]['prompt'].split())

371

In [14]:
# Randomly select 500 indices from valid_data
# selected_indices = random.sample(range(len(valid_data)), 1)
# selected_indices


In [20]:
# Pick a random validation example to inspect.
# randint(0, len(valid_data)) includes len(valid_data) itself, which is one past
# the last valid index and raises IndexError; randrange excludes it.
idx = random.randrange(len(valid_data))
example = valid_data[idx]

print(f"Index: {idx}\n")

print(f"Question: {example['prompt']}")
print("\n#################################################\n")
print(f"Answer: {example['answer']}")

Index: 102

Question: From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation Estimating accurate depth from a single image is challenging because it is an ill-posed problem as infinitely many 3D scenes can be projected to the same 2D scene. However, recent works based on deep convolutional neural networks show great progress with plausible results. The convolutional neural networks are generally composed of two parts: an encoder for dense feature extraction and a decoder for predicting the desired depth. In the encoderdecoder schemes, repeated strided convolution and spatial pooling layers lower the spatial resolution of transitional outputs, and several techniques such as skip connections or multi-layer deconvolutional networks are adopted to recover back to the original resolution for effective dense prediction.In this paper, for more effective guidance of densely encoded features to the desired depth prediction, we propose a network architecture that uti

In [21]:
prompt = f"Question: {valid_data[idx]['prompt']}"

# `base_model` is only ever assigned in commented-out cells, so this cell raised
# NameError on a fresh kernel. Use `model`, the checkpoint loaded with
# AutoPeftModelForCausalLM earlier in the notebook.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=script_args.seq_length)

# NOTE(review): the "<s>[INST] " prefix differs from the
# "Question: ...\n\nAnswer: ..." template built by prepare_sample_text - confirm
# this prompt format is intended for this checkpoint.
result = pipe(f"<s>[INST] {prompt}")
print(result[0]['generated_text'])

<s>[INST] Question: From Big to Small: Multi-Scale Local Planar Guidance for Monocular Depth Estimation Estimating accurate depth from a single image is challenging because it is an ill-posed problem as infinitely many 3D scenes can be projected to the same 2D scene. However, recent works based on deep convolutional neural networks show great progress with plausible results. The convolutional neural networks are generally composed of two parts: an encoder for dense feature extraction and a decoder for predicting the desired depth. In the encoderdecoder schemes, repeated strided convolution and spatial pooling layers lower the spatial resolution of transitional outputs, and several techniques such as skip connections or multi-layer deconvolutional networks are adopted to recover back to the original resolution for effective dense prediction.In this paper, for more effective guidance of densely encoded features to the desired depth prediction, we propose a network architecture that utili

In [31]:
# df_valid_data = valid_data.to_pandas()
# df_valid_data.tail()

In [32]:
# str(df_valid_data.at[35275, 'prompt'])

In [18]:
# str(df_valid_data.at[35275, 'prompt'])
script_args.seq_length

1024

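When generating text with a causal language model (in this notebook, Llama 2 through the Hugging Face `generate` API), several hyperparameters can be tweaked to influence the output. Here is a breakdown of the parameters used in the cells below. Note that `temperature`, `top_p`, and `top_k` only take effect when sampling is enabled (`do_sample=True`).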

1. **`temperature`**: 
    - **Purpose**: Adjusts the randomness of the model's outputs.
    - **Values**:
        - Closer to 0: The model will be more deterministic and more likely to produce the most probable next word at each step.
        - Closer to 1: The model's outputs become more random.
    - **`temperature=0.2`**: In this case, the model's outputs will be more deterministic and confident. There's less randomness.

2. **`top_p` (also known as "nucleus sampling")**:
    - **Purpose**: Prunes the vocabulary before sampling the next word.
    - **Values**:
        - At `top_p=1.0`: Use all words in the vocabulary.
        - At `top_p=0.95`: Use only the smallest set of words such that their cumulative probability exceeds 0.95.
    - **`top_p=0.95`**: The model will only consider the top words that have a cumulative probability of 95%. It helps in reducing the chance of very random words appearing in the generated text.

3. **`top_k`**:
    - **Purpose**: Restricts the model's prediction to the top `k` most likely next words.
    - **Values**: Larger values make the outputs more random, while smaller values make them less random.
    - **`top_k=40`**: The model will only consider the top 40 words for its next word prediction. It's another way to reduce randomness, but it's typically used in conjunction with `top_p` for better results.

4. **`max_new_tokens`**:
    - **Purpose**: Limits the length of the generated output.
    - **`max_new_tokens=script_args.seq_length`**: The generated output will be limited to the length specified by `script_args.seq_length`. It ensures that the model doesn't generate exceedingly long responses.

5. **`repetition_penalty`**:
    - **Purpose**: Penalizes words that are already seen in the output, especially if repeated multiple times.
    - **Values**:
        - Equal to 1: No penalty applied.
        - Greater than 1: Apply a penalty.
    - **`repetition_penalty=1.0`**: In this case, no penalty is applied for repeated words.

In [19]:
# generate_kwargs = dict(
#     input_ids=eval_dataset,
#     temperature=0.1, 
#     top_p=1.0, 
#     # top_p=0.9, 
#     top_k=20,
#     max_new_tokens=4000,
#     repetition_penalty=1.0
# )

# outputs = base_model.generate(**generate_kwargs)

In [20]:
# # device = 'cpu'

# # tokenizer.cpu()

# # tokenizer = AutoTokenizer.from_pretrained(model_name)

# sample_data = valid_data[5]
# # inputs = tokenizer.encode(f"{train_data[100]['prompt'][-7000:]}", return_tensors="pt").to(device)
# # inputs = tokenizer.encode(f"{train_data[0]['prompt']}", return_tensors="pt").to(device)
# inputs = tokenizer.encode(f"Question: {sample_data['prompt']}", return_tensors="pt").to(device)

# # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
# # generate_kwargs = dict(
# #     input_ids=inputs,
# #     temperature=0.2, 
# #     top_p=0.90, 
# #     top_k=40,
# #     max_new_tokens=script_args.seq_length,
# #     repetition_penalty=1.1
# # )

# # generate_kwargs = dict(
# #         input_ids=inputs,
# #         temperature=0.5, 
# #         top_p=1.0, 
# #         # top_p=0.9, 
# #         top_k=1000,
# #         max_new_tokens=4000,
# #         repetition_penalty=1.0
# #     )

# generate_kwargs = dict(
#     input_ids=inputs,
#     temperature=0.1, 
#     top_p=1.0, 
#     # top_p=0.9, 
#     top_k=20,
#     max_new_tokens=4000,
#     repetition_penalty=1.0
# )


# outputs = base_model.generate(**generate_kwargs)
# predictions = tokenizer.decode(outputs[0])
# print(f"Label: \n{sample_data['answer']}\n")
# print("Prediction")
# print(predictions.split("Answer: ")[-1].replace("</s>", ""))

In [21]:
# outputs[0]

In [22]:
# outputs[0].shape
# # tokenizer.decode(outputs[0])

# Compute metrics

In [23]:
# batch_decode

In [24]:
# # inputs = tokenizer.batch_encode(f"Question: {valid_ex['prompt']}", return_tensors="pt").to(device)
# inputs = tokenizer.batch_encode(eval_dataset, return_tensors="pt").to(device)

# # text = f"Question: {example['prompt']}\n\nAnswer: {example['answer']}"
# # generate_kwargs = dict(
# #     input_ids=inputs,
# #     temperature=0.2, 
# #     top_p=0.90, 
# #     top_k=40,
# #     max_new_tokens=script_args.seq_length,
# #     repetition_penalty=1.1
# # )

# generate_kwargs = dict(
#     input_ids=inputs,
#     temperature=0.2, 
#     top_p=0.95, 
#     top_k=40,
#     max_new_tokens=500,
#     repetition_penalty=1.3
# )

# outputs = model.generate(**generate_kwargs)
# # predictions = tokenizer.batch_decode(outputs[0])

In [25]:
# Number of examples in valid_data, the collection iterated by the generation loop in the next cell.
len(valid_data)

2353

In [26]:
# Evaluation loop: generate an answer for each example in valid_data and collect
# (reference answer, model prediction) pairs for the metric cells further down.
labels = []
preds = []

# Prompts with at least this many whitespace-separated words are skipped.
# NOTE(review): presumably a cheap proxy for the model's context limit - confirm.
MAX_PROMPT_WORDS = 2400

# Debug cap: stop right after the first *processed* example whose enumerate
# index is >= this value (skipped prompts do not trigger the stop).
# Set to None to run over all of valid_data.
# An alternative used during development was to evaluate a random subset,
# e.g. random.sample(range(len(valid_data)), 500).
LAST_EVAL_INDEX = 5

# Generation settings are loop-invariant, so they are built once here.
# NOTE(review): in Hugging Face `generate`, temperature / top_k / top_p only
# influence decoding when sampling is enabled (do_sample=True, passed here or
# set in the model's generation config) - confirm this holds for base_model.
sampling_kwargs = dict(
    temperature=0.5,
    top_p=1.0,
    top_k=1000,
    max_new_tokens=4000,
    repetition_penalty=1.0,
)

for i, valid_ex in tqdm(enumerate(valid_data), total=len(valid_data)):
    if len(valid_ex['prompt'].split()) >= MAX_PROMPT_WORDS:
        continue

    inputs = tokenizer.encode(f"Question: {valid_ex['prompt']}", return_tensors="pt").to(device)
    generate_kwargs = dict(input_ids=inputs, **sampling_kwargs)

    outputs = base_model.generate(**generate_kwargs)

    # For a decoder-only (causal) LM, generate() returns the prompt tokens
    # followed by the continuation. Decode only the continuation so the prompt
    # can never leak into the prediction when the model does not emit an
    # "Answer: " marker, and let the tokenizer drop special tokens itself.
    # (Assumes base_model is a causal LM, as the notebook's imports suggest.)
    generated_tokens = outputs[0][inputs.shape[-1]:]
    predictions = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Keep only the text after the last "Answer: " marker, if the model wrote one.
    # The replace() is kept as a fallback for a literal "</s>" in the text.
    preds.append(predictions.split("Answer: ")[-1].replace("</s>", ""))
    labels.append(valid_ex['answer'])

    if LAST_EVAL_INDEX is not None and i >= LAST_EVAL_INDEX:
        break



  0%|          | 5/2353 [04:27<34:52:22, 53.47s/it]


In [27]:
# Lists handed to the metric functions. Everything collected by the generation
# loop is scored; to score a subset, slice both lists identically instead
# (e.g. labels[:200] and preds[:200]).
labels_, preds_ = labels, preds

In [28]:
# Number of reference answers that will be scored.
len(labels_)

6

In [29]:
# labels_[10]

IndexError: list index out of range

In [None]:
# Number of predictions that will be scored; expected to equal len(labels_)
# because the generation loop appends to both lists in the same iteration.
len(preds_)

In [30]:
# preds_[10]

In [31]:
# Score the collected predictions against the reference answers with the two
# project metrics and print every resulting entry.
# NOTE(review): the saved output of this cell shows execution stopping at an
# ipdb.set_trace() inside evaluation_metrics.make_list_of_pairs_json_based;
# that breakpoint has to be removed there for this cell to run unattended.
property_scores = Metrics.evaluate_property_wise_json_based(
    label_list=labels_,
    prediction_list=preds_,
)
rouge_scores = Metrics.evaluate_rouge(
    label_list=labels_,
    prediction_list=preds_,
)

# Merge the ROUGE entries into the property-wise result (same object as before).
results = property_scores
results.update(rouge_scores)

print("Results:")
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

> [0;32m/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/notebooks/evaluation_metrics.py[0m(403)[0;36mmake_list_of_pairs_json_based[0;34m()[0m
[0;32m    402 [0;31m[0;34m[0m[0m
[0m[0;32m--> 403 [0;31m                [0mitem1_str[0m [0;34m=[0m [0mitem1[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    404 [0;31m                [0mitem2_str[0m [0;34m=[0m [0mitem2[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  L


*** NameError: name 'L' is not defined


ipdb>  l


[1;32m    398 [0m        [0;32mfor[0m [0mitem1[0m [0;32min[0m [0mlabel_contribution_list[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    399 [0m            [0;32mfor[0m [0mitem2[0m [0;32min[0m [0mprediction_contribution_list[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    400 [0m[0;34m[0m[0m
[1;32m    401 [0m                [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m    402 [0m[0;34m[0m[0m
[0;32m--> 403 [0;31m                [0mitem1_str[0m [0;34m=[0m [0mitem1[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    404 [0m                [0mitem2_str[0m [0;34m=[0m [0mitem2[0m[0;34m[0m[0;34m[0m[0m
[1;32m    405 [0m                [0;32mif[0m [0misinstance[0m[0;34m([0m[0mitem1[0m[0;34m,[0m [0mdict[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    406 [0m                    [0mitem1_str[0m [0;34m=[0m [0mjson[0m[0;34m.[0m[0mdumps[0m[0;34m([0m[0mitem

ipdb>  item1


{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}


ipdb>  item2


{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '82.5%'}}


ipdb>  label_list


["[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}}]", 'unanswerable', "[{'LEADERBOARD': {'Task': 'Multi-Label Classification', 'Dataset': 'MS-COCO', 'Metric': 'mAP', 'Score': '77.1'}}, {'LEADERBOARD': {'Task': 'Multi-Label Classification', 'Dataset': 'NUS-WIDE', 'Metric': 'MAP', 'Score': '62.0'}}]", "[{'LEADERBOARD': {'Task': 'Image Generation', 'Dataset': 'RC-49', 'Metric': 'Intra-FID', 'Score': '0.389'}}]", "[{'LEADERBOARD': {'Task': 'Unsupervised Video Object Segmentation', 'Dataset': 'DAVIS 2016', 'Metric': 'F-measure (Decay)', 'Score': '1.8'}}, {'LEADERBOARD': {'Tas

ipdb>  len(label_list)


6


ipdb>  label_list[0]


"[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}}]"


ipdb>  prediction_list[0]


"[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '82.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-10', 'Score': '94.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-20', 'Score': '96.9%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-5', 'Score': '92.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'mAP', 'Score': '67.9%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'MAP', 'Score': '78.1'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'Rank-1', 'Score': '80.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'Rank-10', 'Score': '92.0'}}, {'LEADERBOAR

ipdb>  label


"[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}}]"


ipdb>  prediction


"[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '82.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-10', 'Score': '94.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-20', 'Score': '96.9%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-5', 'Score': '92.5%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'mAP', 'Score': '67.9%'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'MAP', 'Score': '78.1'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'Rank-1', 'Score': '80.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501-Dets-0.25', 'Metric': 'Rank-10', 'Score': '92.0'}}, {'LEADERBOAR

ipdb>  ll


[1;32m    391 [0m[0;32mdef[0m [0mmake_list_of_pairs_json_based[0m[0;34m([0m[0mlabel_list[0m[0;34m,[0m [0mprediction_list[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    392 [0m    [0;31m# make list of (label,prediction,similarity)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m    393 [0m    [0mlist_of_label_prediction_pairs[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[1;32m    394 [0m    [0;32mfor[0m [0mlabel[0m[0;34m,[0m [0mprediction[0m [0;32min[0m [0mzip[0m[0;34m([0m[0mlabel_list[0m[0;34m,[0m [0mprediction_list[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m    395 [0m        [0mpair_list[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[1;32m    396 [0m        [0mlabel_contribution_list[0m [0;34m=[0m [0mget_contribution_list_json_based[0m[0;34m([0m[0mlabel[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m    397 [0m        [0mprediction_contribution_list

ipdb>  label


"[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}}]"


ipdb>  get_contribution_list_json_based(label)


[{'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, {'LEADERBOARD': {'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}}]


ipdb>  get_contribution_list_json_based("unanswerable")


['unanswerable']


ipdb>  q


In [None]:
# NOTE(review): item_list is not assigned anywhere in the visible part of this
# notebook; unless an earlier cell defines it, this cell raises NameError on a
# fresh kernel - confirm or remove.
item_list

In [None]:
# First reference answer collected by the generation loop.
labels[0]

In [None]:
# First model prediction, for side-by-side comparison with labels[0].
preds[0]

In [None]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
#                     references=[1 if "unanswerable" == x else 0 for x in labels]
#                     # references=[1 for df['answer'].tolist()]
# )


In [None]:
# rouge = evaluate.load('rouge')

# results = rouge.compute(
#     predictions=[pred.replace("</s>", "") for pred in preds],
#     # predictions=preds,
#     references=labels
# )
# results

In [None]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# clf_metrics.compute(predictions=[1 if "unanswerable" == x.replace("</s>", "") else 0 for x in preds], 
#                     references=[1 if "unanswerable" == x else 0 for x in labels]
#                     # references=[1 for df['answer'].tolist()]
# )


In [None]:
# rouge = evaluate.load('rouge')

# results = rouge.compute(
#     predictions=[pred.replace("</s>", "") for pred in preds],
#     # predictions=preds,
#     references=labels
# )
# results

In [None]:
# results

In [None]:
# i = 0
# for x in tqdm(preds):
#     if "unanswerable" in x:
#         ipdb.set_trace()
#     else:
#         i+=1
        
#         # print(x)