In [1]:
# Evaluate a fine-tuned Flan-T5 checkpoint on a custom dataset
import os, ipdb
import random

import numpy as np
import torch
from fuzzywuzzy import fuzz

import pandas as pd
import ast
from tqdm import tqdm
from dataclasses import dataclass, field
from typing import Optional

from datasets import DatasetDict, Dataset, load_from_disk
from tokenizers import AddedToken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, HfArgumentParser
from transformers.optimization import Adafactor, AdafactorSchedule

import random, evaluate


from evaluation_metrics import Metrics, THRESHOLD
# Seed every random number generator the notebook touches so that a
# Restart & Run All reproduces the same shuffles and samples.
seed = 42
# torch.manual_seed covers the default (CPU) generator, which
# torch.cuda.manual_seed_all alone leaves unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

# Prefer the GPU when one is visible; the bare expression below displays the choice.
device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-10-26 16:21:28.085603: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

In [2]:
THRESHOLD

80

In [3]:
# !pip install fuzzywuzzy --user

In [4]:
# import ipywidgets as widget
# widget.IntSlider()

In [5]:
# from huggingface_hub import notebook_login
# notebook_login()

In [6]:
@dataclass
class ScriptArguments:
    """Configuration container for this notebook.

    Every field carries a ``help`` entry in its metadata and a default value,
    so the class can be instantiated without arguments.
    """

    # --- model / experiment tracking ---
    model_name: Optional[str] = field(
        default="google/flan-t5",
        metadata={"help": "the model name"},
    )
    log_with: Optional[str] = field(
        default="wandb",
        metadata={"help": "use 'wandb' to log with wandb"},
    )

    # --- data ---
    dataset_name: Optional[str] = field(
        default="lvwerra/stack-exchange-paired",
        metadata={"help": "the dataset name"},
    )
    subset: Optional[str] = field(
        default="data/finetune",
        metadata={"help": "the subset to use"},
    )
    split: Optional[str] = field(
        default="train",
        metadata={"help": "the split to use"},
    )
    size_valid_set: Optional[int] = field(
        default=4000,
        metadata={"help": "the size of the validation set"},
    )
    streaming: Optional[bool] = field(
        default=True,
        metadata={"help": "whether to stream the dataset"},
    )
    shuffle_buffer: Optional[int] = field(
        default=5000,
        metadata={"help": "the shuffle buffer size"},
    )
    seq_length: Optional[int] = field(
        default=1024,
        metadata={"help": "the sequence length"},
    )
    num_workers: Optional[int] = field(
        default=4,
        metadata={"help": "the number of workers"},
    )

    # --- training loop ---
    max_steps: Optional[int] = field(
        default=500,
        metadata={"help": "the maximum number of sgd steps"},
    )
    logging_steps: Optional[int] = field(
        default=10,
        metadata={"help": "the logging frequency"},
    )
    save_steps: Optional[int] = field(
        default=10,
        metadata={"help": "the saving frequency"},
    )
    per_device_train_batch_size: Optional[int] = field(
        default=4,
        metadata={"help": "the per device train batch size"},
    )
    per_device_eval_batch_size: Optional[int] = field(
        default=1,
        metadata={"help": "the per device eval batch size"},
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=2,
        metadata={"help": "the gradient accumulation steps"},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "whether to use gradient checkpointing"},
    )
    group_by_length: Optional[bool] = field(
        default=False,
        metadata={"help": "whether to group by length"},
    )
    packing: Optional[bool] = field(
        default=True,
        metadata={"help": "whether to use packing for SFTTrainer"},
    )

    # --- LoRA ---
    lora_alpha: Optional[float] = field(
        default=16,
        metadata={"help": "the lora alpha parameter"},
    )
    lora_dropout: Optional[float] = field(
        default=0.05,
        metadata={"help": "the lora dropout parameter"},
    )
    lora_r: Optional[int] = field(
        default=8,
        metadata={"help": "the lora r parameter"},
    )

    # --- optimisation ---
    learning_rate: Optional[float] = field(
        default=1e-4,
        metadata={"help": "the learning rate"},
    )
    lr_scheduler_type: Optional[str] = field(
        default="cosine",
        metadata={"help": "the lr scheduler type"},
    )
    num_warmup_steps: Optional[int] = field(
        default=100,
        metadata={"help": "the number of warmup steps"},
    )
    weight_decay: Optional[float] = field(
        default=0.05,
        metadata={"help": "the weight decay"},
    )
    optimizer_type: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "the optimizer type"},
    )

    # --- output ---
    output_dir: Optional[str] = field(
        default="./results",
        metadata={"help": "the output directory"},
    )
    log_freq: Optional[int] = field(
        default=1,
        metadata={"help": "the logging frequency"},
    )


# Parse an explicitly empty argument list so the dataclass defaults are used
# and the notebook kernel's own command line is never consulted.
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses([])[0]

if script_args.group_by_length and script_args.packing:
    raise ValueError("Cannot use both packing and group by length")

# --- model and data locations -------------------------------------------
# `size` is appended to `model_name` when the tokenizer is loaded and is
# interpolated into the output/run names below, so it must be set first.
script_args.model_name = "google/flan-t5"
script_args.size = "large"
script_args.test_dataset = "../data/LLLM_DOCTEAT_TDMS_DROP_1/fold2"
script_args.dataset_name = "../data/LLLM_DOCTEAT_TDMS_ALL_TEMPLATE/fold2"

# Checkpoint sub-directory (inside `output_dir`) to evaluate.
# Alternatives left by the author: "checkpoint-103350", "checkpoint-82680".
script_args.test_ckpt = "checkpoint-40000"
script_args.output_dir = f"../model_ckpt/docteat_flan_t5_{script_args.size}_tdms_f2_all_template_final_2"
script_args.run_name = f"eval_sft_docteat_flan_t5_{script_args.size}_tdms_f2_all_template_final_2"

# --- sequence lengths and padding ----------------------------------------
script_args.seq_length = script_args.max_source_length = 512
script_args.max_target_length = script_args.model_max_length = 512
script_args.label_pad_token_id = -100
script_args.pad_to_multiple_of = 8

# --- batching ------------------------------------------------------------
script_args.per_device_train_batch_size = 4
script_args.gradient_accumulation_steps = 2
script_args.per_device_eval_batch_size = 42

# --- schedule, logging and checkpointing ---------------------------------
script_args.save_steps = script_args.logging_steps = 50
script_args.streaming = False
script_args.num_train_epochs = 5
script_args.save_total_limit = 10
script_args.fuzz_ratio = 50

In [7]:
script_args.seq_length

512

In [8]:
# The tokenizer comes from the base model id ("<model_name>-<size>"), while the
# weights come from the fine-tuned checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(f"{script_args.model_name}-{script_args.size}")

# Register newline and curly braces as explicit tokens.
# NOTE(review): the model below is loaded from a checkpoint without resizing its
# embeddings; this presumably relies on the checkpoint having been trained with
# the same three tokens already added — confirm.
tokenizer.add_tokens(AddedToken("\n", normalized=False))
tokenizer.add_tokens(AddedToken("{", normalized=False))
tokenizer.add_tokens(AddedToken("}", normalized=False))

model = AutoModelForSeq2SeqLM.from_pretrained(f"{script_args.output_dir}/{script_args.test_ckpt}")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=script_args.label_pad_token_id,
    # Was script_args.label_pad_token_id (-100), which is not a valid multiple;
    # the dedicated setting (8) is the intended value.
    pad_to_multiple_of=script_args.pad_to_multiple_of,
)

num_gpus = torch.cuda.device_count()

print(f"Max token lenght: {tokenizer.model_max_length}")
# max(num_gpus, 1) keeps the figure meaningful on a CPU-only machine, where
# device_count() is 0.
print(f"Batch size: {script_args.per_device_train_batch_size * script_args.gradient_accumulation_steps * max(num_gpus, 1) }")
print(f"Number of GPUs available: {num_gpus}")

Max token lenght: 512
Max token lenght: 512
Batch size: 8
Number of GPUs available: 1


In [34]:
# Load the saved DatasetDict for the test configuration and shuffle it with
# the notebook-wide seed in one step.
dataset = DatasetDict.load_from_disk(f"{script_args.test_dataset}").shuffle(seed=seed)

train_dataset = dataset["train"]
# Only the first of five shards of the validation split is evaluated.
eval_dataset = dataset["validation"].shard(num_shards=5, index=0)

print(f"length train_dataset: {len(train_dataset)}")
print(f"length eval_dataset: {len(eval_dataset)}")

length train_dataset: 5513
length eval_dataset: 471


In [18]:
# # eval_dataset[0]
# try:
        
# except :
    

In [19]:
def clean_and_parse(list_string):
    """Convert a list of model/label strings into Python objects where possible.

    Each string is handled as follows:

    * ends with ``]``        -> parsed as-is with ``ast.literal_eval``;
    * contains no ``[``      -> kept unchanged as a plain string
      (an empty string falls in this category);
    * otherwise (truncated)  -> cut after the last ``}}``, closed with `` ]``
      and then parsed.

    Strings that fail to parse are skipped and counted; the count is printed
    as ``All missed: <n>``.

    Args:
        list_string: iterable of strings to parse.

    Returns:
        A list holding, in input order, the parsed objects and the untouched
        plain strings. Unparseable entries are omitted, so the result may be
        shorter than the input.
    """
    # Every exception ast.literal_eval is documented to raise on malformed
    # input; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    return_list = []
    missed = 0  # number of strings that could not be parsed
    for input_string in list_string:
        # endswith() is safe on empty strings, unlike indexing with [-1].
        if input_string.endswith("]"):
            candidate = input_string
        elif "[" not in input_string:
            return_list.append(input_string)
            continue
        else:
            # Drop the incomplete trailing entry: keep everything up to and
            # including the last complete "}}" and close the list.
            end_pos = input_string.rfind('}}') + 2
            candidate = input_string[:end_pos] + " ]"

        try:
            return_list.append(ast.literal_eval(candidate))
        except parse_errors:
            # TODO: We need a best way to deal with this
            missed += 1

    print(f"All missed: {missed}")
    return return_list

def calculate_fuzz_ratio(text1, text2):
    """Return ``fuzz.ratio`` of the two inputs after normalisation.

    Both arguments are converted with ``str``, stripped of surrounding
    whitespace and lower-cased before being compared.
    """
    left = str(text1).strip().lower()
    right = str(text2).strip().lower()
    return fuzz.ratio(left, right)

# Module-level accumulators, one strict/partial pair per property.
# NOTE(review): nothing in the visible notebook reads or appends to these;
# compute_metrics binds its own local `results` — confirm they are still needed.
strict_list_tasks, partial_list_tasks = [], []
strict_list_datasets, partial_list_datasets = [], []
strict_list_metrics, partial_list_metrics = [], []
strict_list_scores, partial_list_scores = [], []

results = dict()
def compute_metrics(eval_preds):
    """Decode predictions and labels, then score them with ``Metrics``.

    Args:
        eval_preds: a ``(predictions, labels)`` pair. If ``predictions`` is a
            tuple, only its first element is used.

    Returns:
        The mapping returned by ``Metrics.evaluate_property_wise_json_based``,
        updated in place with the output of ``Metrics.evaluate_rouge``.
    """

    def _to_text(token_ids):
        # -100 entries cannot be decoded, so swap them for the pad token first.
        decodable = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)  # type: ignore
        return tokenizer.batch_decode(decodable, skip_special_tokens=True)

    raw_preds, raw_labels = eval_preds
    if isinstance(raw_preds, tuple):
        raw_preds = raw_preds[0]

    prediction_texts = _to_text(raw_preds)
    label_texts = _to_text(raw_labels)

    scores = Metrics.evaluate_property_wise_json_based(
        label_list=label_texts, prediction_list=prediction_texts
    )
    scores.update(
        Metrics.evaluate_rouge(label_list=label_texts, prediction_list=prediction_texts)
    )
    return scores


    
def tokenize_function(sample):
    """Tokenize a batch of prompt/answer pairs for seq2seq evaluation.

    Args:
        sample: mapping with a ``"prompt"`` entry (model inputs) and an
            ``"answer"`` entry (targets).

    Returns:
        The tokenizer output for the prompts with an extra ``"labels"`` entry:
        the answer token ids in which every pad token id is replaced by -100.
    """
    pad_id = tokenizer.pad_token_id

    # Prompts: padded/truncated to the configured source length.
    encoded_prompts = tokenizer(
        sample["prompt"],
        max_length=script_args.max_source_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Answers: tokenized as targets via the `text_target` keyword.
    encoded_answers = tokenizer(
        text_target=sample["answer"],
        max_length=script_args.max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Mask padding positions in the labels with -100.
    masked_labels = []
    for answer_ids in encoded_answers["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in answer_ids]
        )

    encoded_prompts["labels"] = masked_labels
    return encoded_prompts

In [20]:
# train_tokenized_dataset = train_dataset.map(tokenize_function, batched=True,
#                                             # remove_columns=dataset_columns_to_remove
#                                             )
# Tokenize the evaluation shard in batches; the original columns are kept.
eval_tokenized_dataset = eval_dataset.map(
    tokenize_function,
    batched=True,
)
# print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

In [21]:
# Adafactor with relative step sizing: no explicit learning rate is given
# (lr=None) and the matching AdafactorSchedule is built from the optimizer.
optimizer = Adafactor(
    model.parameters(),
    lr=None,
    scale_parameter=True,
    relative_step=True,
    warmup_init=True,
)
lr_scheduler = AdafactorSchedule(optimizer)

# Built here but only used if it is passed to the trainer's `callbacks`.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.001,
)

In [35]:
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are read/written and how the run is labelled.
    output_dir=script_args.output_dir,
    run_name=script_args.run_name,

    # Batching.
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,

    # Optimisation schedule.
    learning_rate=script_args.learning_rate,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_steps=script_args.num_warmup_steps,
    num_train_epochs=script_args.num_train_epochs,

    # Logging, evaluation and checkpoint cadence.
    logging_steps=script_args.logging_steps,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Score generated sequences rather than teacher-forced logits.
    predict_with_generate=True,
    generation_max_length=script_args.max_target_length,
)


# Create Trainer instance
# No train_dataset is supplied: this trainer is only used for evaluate()/predict().
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=eval_tokenized_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
)

print(f"Test data {script_args.test_dataset}")

# Evaluation only: run the eval loop on the trainer's eval_dataset and list
# every returned metric, one per line.
results_output = trainer.evaluate()
print("Results:")
for key, value in results_output.items():
    print(f"{key}: {value}")

Test data ../data/LLLM_DOCTEAT_TDMS_DROP_1/fold2


Results:
eval_loss: 0.4898712635040283
eval_general_accuracy: 99.15
eval_exact_recalls_task: 53.14
eval_exact_recalls_dataset: 39.48
eval_exact_recalls_metric: 46.49
eval_exact_recalls_Score: 0.37
eval_exact_recalls_overall: 34.87
eval_partial_recalls_task: 57.2
eval_partial_recalls_dataset: 44.28
eval_partial_recalls_metric: 48.34
eval_partial_recalls_Score: 0.73
eval_partial_recalls_overall: 37.64
eval_exact_precisions_task: 49.15
eval_exact_precisions_dataset: 36.52
eval_exact_precisions_metric: 43.0
eval_exact_precisions_Score: 0.34
eval_exact_precisions_overall: 32.25
eval_partial_precisions_task: 52.9
eval_partial_precisions_dataset: 40.96
eval_partial_precisions_metric: 44.71
eval_partial_precisions_Score: 0.68
eval_partial_precisions_overall: 34.81
eval_exact_f1s_task: 51.06
eval_exact_f1s_dataset: 37.94
eval_exact_f1s_metric: 44.68
eval_exact_f1s_Score: 0.35
eval_exact_f1s_overall: 33.51
eval_partial_f1s_task: 54.96
eval_partial_f1s_dataset: 42.55
eval_partial_f1s_metric: 46.4

In [15]:
# ast.literal_eval(data)
# results_output

In [16]:
# script_args.test_ckpt

In [17]:
# pint()

In [18]:
# from datasets import load_metric

# rouge = evaluate.load("rouge")

# predictions = ['unanswerable', 'unanswerable', 'unanswerable', "[{ 'LEADERBOARD': { 'Task': '3D Human Pose Estimation', 'Dataset': '3D Poses in the Wild Challenge', 'Metric': 'MPJPE', 'Score': '68.83'}} ]", "[{ 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'ImageNet', 'Metric': 'Top 1 Accuracy', 'Score': '79.6%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'Flowers-102', 'Metric': 'Accuracy', 'Score': '99.1%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2018', 'Metric': 'Top-1 Accuracy', 'Score': '81.2%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2018', 'Metric': 'Top-1 Accuracy', 'Score': '69.8%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'CIFAR-100', 'Metric': 'Percentage correct', 'Score': '83.7'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2019', 'Metric': 'Top-1 Accuracy', 'Score': '84.1'}}, { 'LEADERBOARD': { 'Task': 'Fine-Grained Image Classification', 'Dataset': 'Oxford 102 Flowers', 'Metric': 'Accuracy', 'Score': '99.1%'}}"]
# references = ['unanswerable', 'unanswerable', 'unanswerable', "[{ 'LEADERBOARD': { 'Task': '3D Human Pose Estimation', 'Dataset': '3D Poses in the Wild Challenge', 'Metric': 'MPJPE', 'Score': '68.83'}} ]", "[{ 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'ImageNet', 'Metric': 'Top 1 Accuracy', 'Score': '79.6%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'Flowers-102', 'Metric': 'Accuracy', 'Score': '99.1%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2018', 'Metric': 'Top-1 Accuracy', 'Score': '81.2%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2018', 'Metric': 'Top-1 Accuracy', 'Score': '69.8%'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'CIFAR-100', 'Metric': 'Percentage correct', 'Score': '83.7'}}, { 'LEADERBOARD': { 'Task': 'Image Classification', 'Dataset': 'iNaturalist 2019', 'Metric': 'Top-1 Accuracy', 'Score': '84.1'}}, { 'LEADERBOARD': { 'Task': 'Fine-Grained Image Classification', 'Dataset': 'Oxford 102 Flowers', 'Metric': 'Accuracy', 'Score': '99.1%'}}"]

# # predictions = ['unanswerable', 'unanswerable', 'unanswerable']
# # references = ['unanswerable', 'unanswerable', 'unanswerable']

# scores = rouge.compute(predictions=predictions, 
#                        references=predictions,
#                        use_stemmer=True
#                       )

# print(scores)

In [19]:
# import json

# input_string = "[{ 'LEADERBOARD': { 'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'MAP', 'Score': '89.5'}}, { 'LEADERBOARD': { 'Task': 'Person Re-Identification', 'Dataset': 'Market-1501', 'Metric': 'Rank-1', 'Score': '95.7'}}, { 'LEADERBOARD': { 'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'MAP', 'Score': '81.84'}}, { 'LEADERBOARD': { 'Task': 'Person Re-Identification', 'Dataset': 'DukeMTMC-reID', 'Metric': 'Rank-1', 'Score': '91.11'}} ]"
# input_string = '[{ "contribution": { "disease name": "COVID-19", "location": "Switzerland", "date": "start of the epidemic", "R0 value": "3.15", "%CI values": "95% CI: 2.13-3.76", "method": "stochastic transmission model explicitly simulating within hospital dynamics"}}, {"contribution": {"disease name": "COVID-19","location": "Switzerland","date": "March 29-April 5","R0 value": "0.44","%CI values": "95% QR: 0.27-0.65","method": "stochastic transmission model explicitly simulating within hospital dynamics"}}]'

# input_string = "[{ 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'D(BERT): F1', 'Score': '34.12'}}, { 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'D(BiDAF): F1', 'Score': '31.: }}, { 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'D(RaBERT): F1', 'Score': '30.61'}}, { 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'Overall: F1', 'Score': '80.38'}}, { 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'D(BERT): F1', 'Score': '34.09'}}, { 'LEADERBOARD': { 'Task': 'Reading Comprehension', 'Dataset': 'AdversarialQA', 'Metric': 'D(BiDAF): F1', 'Score': '34.58'}} ]"

In [20]:
 # ast.literal_eval(input_string)

In [21]:
# json.loads(
#     # '[{"contribution": { "disease name": "COVID-19", "location": "Switzerland", "date": "start of the epidemic", "R0 value": "3.15", "%CI values": "95% CI: 2.13-3.76", "method": "stochastic transmission model explicitly simulating within hospital dynamics"}}]'
#     input_string
# )

In [22]:
# 'Dataset'.lower()

In [23]:
# 'threshold'.upper()

In [24]:
# fuzz.ratio("hello", "hola")

In [25]:
# calculate_fuzz_ratio("hello", "Task jhgjkjg")

In [26]:
# import ast

# def clean_and_parse(input_string):
#     # Find the last valid dictionary's end position
#     end_pos = input_string.rfind('}}') + 2
#     cleaned_string = input_string[:end_pos] + " ]"
#     # ipdb.set_trace()
    
#     # Convert to q
    
#     list_of_dicts = ast.literal_eval(cleaned_string)
#     return list_of_dicts

# input_str = "[{ 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': "

# parsed_data = clean_and_parse(input_str)
# print(parsed_data)


## Inference on a sample Input

In [27]:
# len(eval_dataset)
# random.sample(range(len(eval_dataset)), 500)


In [28]:
# idx = random.randint(0, len(eval_dataset))

# eval_dataset[idx]
# # # 

In [29]:
# preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
# decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
# # Replace -100 in the labels as we can't decode them.
# labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
# decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
# inputs
# outputs

In [30]:
# sample_input = eval_dataset[idx]['prompt']
# # inputs = tokenizer.encode(sample_input, return_tensors="pt").to(device)
# inputs = tokenizer.encode(sample_input, max_length=script_args.max_source_length, 
#                              padding="max_length", truncation=True,
#                              return_tensors="pt").to(device)

# inputs = torch.where(inputs != -100, inputs, tokenizer.pad_token_id)
# # inputs = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in inputs] # type: ignor

# # inputs = np.where(preds != -100, inputs, tokenizer.pad_token_id)  # type: ignore

# # model = model.to(device)
# with torch.no_grad():
#     outputs = model.generate(inputs)

# outputs = torch.where(outputs != -100, outputs, tokenizer.pad_token_id)
# # # decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(decoded_output)

In [31]:
# preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
# decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

In [32]:
# predictions, labels, metrics = trainer.predict(
#     tokenizer.encode(
#         eval_dataset[idx]['prompt'], 
#         return_tensors="pt"
#     )
# )
# outputs

In [33]:
# input_text = eval_dataset[idx]['prompt']
# input_tokenized = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# predictions, metrics  = trainer.predict([input_tokenized])


## trainer.predict(inputs)

In [36]:
# Alternative subsets to predict on (disabled):
# predict_dataset = dataset["validation"].shard(num_shards=20, index=0)
# predict_dataset = dataset["train"].shard(num_shards=2, index=0)
# Predict on the same validation shard that is bound to `eval_dataset`.
predict_dataset = eval_dataset
# Display the number of examples that will be predicted.
len(predict_dataset)

471

In [37]:
predict_dataset[0]

{'prompt': 'Answer based on context:\n\nGeometric and Combinatorial Properties of Well-Centered Triangulations in Three and Higher Dimensions An n-simplex is said to be n-well-centered if its circumcenter lies in its interior. We introduce several other geometric conditions and an algebraic condition that can be used to determine whether a simplex is n-well-centered. These conditions, together with some other observations, are used to describe restrictions on the local combinatorial structure of simplicial meshes in which every simplex is well-centered. In particular, it is shown that in a 3-well-centered (2well-centered) tetrahedral mesh there are at least 7 (9) edges incident to each interior vertex, and these bounds are sharp. Moreover, it is shown that, in stark contrast to the 2-dimensional analog, where there are exactly two vertex links that prevent a well-centered triangle mesh in R 2 , there are infinitely many vertex links that prohibit a well-centered tetrahedral mesh in R 3

In [38]:
print(f"len predict dataset {len(predict_dataset)}")

# Re-tokenize the prediction subset (this rebinds eval_tokenized_dataset).
eval_tokenized_dataset = predict_dataset.map(
    tokenize_function,
    batched=True,
)

# trainer.predict returns the generated token ids, the label ids and a metrics mapping.
predictions, labels, metrics = trainer.predict(eval_tokenized_dataset)

len predict dataset 471


Map:   0%|          | 0/471 [00:00<?, ? examples/s]

In [43]:
# Show one randomly chosen example: its context, the gold answer and the
# model's decoded prediction.
# randrange excludes the upper bound; randint(0, len(...)) could return
# len(predict_dataset) and index past the end.
idx = random.randrange(len(predict_dataset))
sample = predict_dataset[idx]

print(f"Index {idx}\n")
print("\nContext:\n")
# The prompt is split on blank lines; the second piece is shown as the context.
print(sample['prompt'].split("\n\n")[1])
print("\nLabel:")
print(sample['answer'])

# Replace any -100 padding before decoding, as compute_metrics does.
# (assumes `predictions` is the array returned by trainer.predict — TODO confirm)
token_ids = np.where(predictions[idx] != -100, predictions[idx], tokenizer.pad_token_id)
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
print("\nPredictions:")
print(decoded_output)


Index 12


Context:

Accurate 3D Object Detection using Energy-Based Models Accurate 3D object detection (3DOD) is crucial for safe navigation of complex environments by autonomous robots. Regressing accurate 3D bounding boxes in cluttered environments based on sparse LiDAR data is however a highly challenging problem. We address this task by exploring recent advances in conditional energy-based models (EBMs) for probabilistic regression. While methods employing EBMs for regression have demonstrated impressive performance on 2D object detection in images, these techniques are not directly applicable to 3D bounding boxes. In this work, we therefore design a differentiable pooling operator for 3D bounding boxes, serving as the core module of our EBM network. We further integrate this general approach into the state-of-the-art 3D object detector SA-SSD. On the KITTI dataset, our proposed approach consistently outperforms the SA-SSD baseline across all 3DOD metrics, demonstrating the poten

In [40]:
idx

327

In [41]:
predict_dataset[idx]['prompt'].split("\n\n")[1]

'LiteSeg: A Novel Lightweight ConvNet for Semantic Segmentation Semantic image segmentation plays a pivotal role in many vision applications including autonomous driving and medical image analysis. Most of the former approaches move towards enhancing the performance in terms of accuracy with a little awareness of computational efficiency. In this paper, we introduce LiteSeg, a lightweight architecture for semantic image segmentation. In this work, we explore anew deeper version of Atrous Spatial Pyramid Pooling module (ASPP) and apply short and long residual connections, and depthwise separable convolution, resulting in a faster and efficient model. LiteSeg architecture is introduced and tested with multiple backbone networks as Darknet19, MobileNet, and ShuffleNet to provide multiple trade-offs between accuracy and computational cost. The proposed model LiteSeg, with MobileNetV2 as a backbone network, achieves an accuracy of 67.81% mean intersection over union at 161 frames per second

In [51]:
import json

# json_string = json.dumps(my_list, indent=4)  # indent is optional, it makes the output more readable


# with open('output.json', 'w') as f:
#     json.dump(my_list, f, indent=4)  # indent is optional



In [57]:
len(predict_dataset)

471

In [59]:
# Collect one record per predicted example for export.
# Iterating over the dataset and the predictions together covers every
# example; the previous hard-coded range(450) silently dropped the tail of
# the 471-example subset.
prediction_list = []

for example, predicted_ids in zip(predict_dataset, predictions):
    # Replace any -100 padding before decoding, as compute_metrics does.
    token_ids = np.where(predicted_ids != -100, predicted_ids, tokenizer.pad_token_id)
    decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)

    prediction_list.append(
        {
            # The prompt is split on blank lines; the second piece is the context.
            "Context": example['prompt'].split("\n\n")[1],
            "PWC Annotation": example['answer'],
            "LLM prediction": decoded_output
        }
    )


In [61]:
prediction_list[430]

{'Context': "Learning Geometry-Disentangled Representation for Complementary Understanding of 3D Object Point Cloud In 2D image processing, some attempts decompose images into high and low frequency components for describing edge and smooth parts respectively. Similarly, the contour and flat area of 3D objects, such as the boundary and seat area of a chair, describe different but also complementary geometries. However, such investigation is lost in previous deep networks that understand point clouds by directly treating all points or local patches equally. To solve this problem, we propose Geometry-Disentangled Attention Network (GDANet). GDANet introduces Geometry-Disentangle Module to dynamically disentangle point clouds into the contour and flat part of 3D objects, respectively denoted by sharp and gentle variation components. Then GDANet exploits Sharp-Gentle Complementary Attention Module that regards the features from sharp and gentle variation components as two holistic represen

In [62]:
# Persist the collected records as pretty-printed JSON (4-space indent).
with open('../validation_drop_1_fold2_json_leaderboard.json', 'w') as f:
    f.write(json.dumps(prediction_list, indent=4))