In [1]:
# Fine-tune and evaluate Flan-T5 (seq2seq) on a custom dataset
import os, ipdb
import random

import numpy as np
import torch
from fuzzywuzzy import fuzz

import pandas as pd
import ast
from tqdm import tqdm
from dataclasses import dataclass, field
from typing import Optional

from datasets import DatasetDict, Dataset, load_from_disk
from tokenizers import AddedToken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, EarlyStoppingCallback
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, HfArgumentParser
from transformers.optimization import Adafactor, AdafactorSchedule

import random, evaluate


# from ../evaluation_metrics import Metrics
# Seed every random number generator the notebook relies on so runs are repeatable.
seed = 42
torch.manual_seed(seed)           # default torch generator (previously left unseeded)
torch.cuda.manual_seed_all(seed)  # kept explicit for all CUDA devices
np.random.seed(seed)
random.seed(seed)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # or "true", depending on your needs

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else "cpu"
device

2023-09-10 20:01:04.044937: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cuda'

In [2]:
# !pip install fuzzywuzzy --user

In [3]:
# import ipywidgets as widget
# widget.IntSlider()

In [4]:
# from huggingface_hub import notebook_login
# notebook_login()

In [5]:
@dataclass
class ScriptArguments:
    """Options for the fine-tuning / evaluation run, parsed with HfArgumentParser.

    Every field has a default, so the class can be instantiated without
    arguments. The fields in the last section were previously attached to the
    parsed instance as ad-hoc attributes; declaring them gives each one a
    default and a command-line flag. They are appended after the pre-existing
    fields so positional construction keeps working.
    """

    # --- model / logging ---
    model_name: Optional[str] = field(default="google/flan-t5", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})

    # --- data ---
    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})

    # --- training loop ---
    max_steps: Optional[int] = field(default=500, metadata={"help": "the maximum number of sgd steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=10, metadata={"help": "the saving frequency"})
    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "the per device train batch size"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "the per device eval batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=2, metadata={"help": "the gradient accumulation steps"})
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )
    group_by_length: Optional[bool] = field(default=False, metadata={"help": "whether to group by length"})
    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})

    # --- LoRA ---
    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    # --- optimisation ---
    learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "the learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    num_warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    # --- output ---
    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})

    # --- seq2seq / evaluation options (formerly set only as ad-hoc attributes) ---
    size: Optional[str] = field(default="large", metadata={"help": "the model size suffix appended to model_name"})
    test_ckpt: Optional[str] = field(default=None, metadata={"help": "the checkpoint folder inside output_dir to load"})
    run_name: Optional[str] = field(default=None, metadata={"help": "the run name"})
    max_source_length: Optional[int] = field(default=512, metadata={"help": "the maximum number of prompt tokens"})
    max_target_length: Optional[int] = field(default=512, metadata={"help": "the maximum number of answer tokens"})
    label_pad_token_id: Optional[int] = field(default=-100, metadata={"help": "the id used to pad labels"})
    pad_to_multiple_of: Optional[int] = field(default=8, metadata={"help": "pad batches to a multiple of this value"})
    model_max_length: Optional[int] = field(default=512, metadata={"help": "the model maximum length"})
    num_train_epochs: Optional[float] = field(default=5, metadata={"help": "the number of training epochs"})
    save_total_limit: Optional[int] = field(default=10, metadata={"help": "the maximum number of checkpoints to keep"})
    fuzz_ratio: Optional[int] = field(default=80, metadata={"help": "the minimum fuzzy ratio counted as a partial match"})


# An empty argument list is handed to the parser, so script_args starts out
# holding nothing but the ScriptArguments defaults.
parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(args=[])[0]

# Packing and length-grouping cannot be combined.
if script_args.packing and script_args.group_by_length:
    raise ValueError("Cannot use both packing and group by length")

# --- model and checkpoint under test -------------------------------------
script_args.model_name = "google/flan-t5"
script_args.size = "large"
script_args.test_ckpt = "checkpoint-20674"

# --- dataset fold and the paths / run name derived from the model size ----
# Alternative configurations that used to live here as commented-out code:
# LLLM_TDMS_ALL_TEMPLATE/fold1 and LLLM_LONG_TDM_ALL_TEMPLATE/fold1
# (the latter with seq_length 2400 and a train batch size of 2).
script_args.dataset_name = "../data/LLLM_DOCTEAT_TDM_ALL_TEMPLATE/fold2"
script_args.output_dir = f"../model_ckpt/docteat_flan_t5_{script_args.size}_tdm_f2_all_template"
script_args.run_name = f"sft_docteat_flan_t5_{script_args.size}_tdm_f2_all_Template"

# --- sequence lengths and padding ------------------------------------------
script_args.seq_length = 512
script_args.model_max_length = 512
script_args.max_source_length = 512
script_args.max_target_length = 512
script_args.label_pad_token_id = -100
script_args.pad_to_multiple_of = 8

# --- batch sizes ------------------------------------------------------------
script_args.per_device_train_batch_size = 4
script_args.per_device_eval_batch_size = 24
script_args.gradient_accumulation_steps = 2

# --- schedule, checkpointing, logging --------------------------------------
script_args.num_train_epochs = 5
script_args.logging_steps = 50
script_args.save_steps = 50
script_args.save_total_limit = 10
script_args.streaming = False

# --- evaluation: minimum fuzzy ratio that counts as a partial match --------
script_args.fuzz_ratio = 80

In [6]:
# Display the sequence length currently stored on script_args.
script_args.seq_length

512

In [7]:
# The tokenizer is built from the base model id "<model_name>-<size>".
tokenizer = AutoTokenizer.from_pretrained(f"{script_args.model_name}-{script_args.size}")

# Register newline and curly braces as extra, non-normalised tokens, in the
# same order as before so their ids are unchanged.
# NOTE(review): presumably these characters are missing from the base
# vocabulary and the checkpoint was trained with the same additions - confirm.
tokenizer.add_tokens([AddedToken(token, normalized=False) for token in ("\n", "{", "}")])

# Alternatives kept for reference:
# model = AutoModelForSeq2SeqLM.from_pretrained(f"{script_args.model_name}-{script_args.size}")
# tokenizer = AutoTokenizer.from_pretrained(f"{script_args.output_dir}/{script_args.test_ckpt}")

# The weights come from the fine-tuned checkpoint stored under output_dir.
model = AutoModelForSeq2SeqLM.from_pretrained(f"{script_args.output_dir}/{script_args.test_ckpt}")


data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=script_args.label_pad_token_id,
    # Fixed: this used to receive label_pad_token_id (-100) by mistake.
    pad_to_multiple_of=script_args.pad_to_multiple_of,
)

print(f"Max token lenght: {tokenizer.model_max_length}")

num_gpus = torch.cuda.device_count()

print(f"Max token lenght: {tokenizer.model_max_length}")
# Effective batch size = per-device batch * accumulation steps * GPU count.
print(f"Batch size: {script_args.per_device_train_batch_size * script_args.gradient_accumulation_steps * num_gpus }")
print(f"Number of GPUs available: {num_gpus}")

Max token lenght: 512
Max token lenght: 512
Batch size: 8
Number of GPUs available: 1


In [8]:
# Read the saved DatasetDict from disk and shuffle it with the notebook seed.
dataset = DatasetDict.load_from_disk(script_args.dataset_name).shuffle(seed=seed)

# Only the first shard of each split is kept: 1/1000 of "train" and 1/4 of
# "validation". To use the full splits, take dataset["train"] and
# dataset["validation"] directly.
train_dataset = dataset["train"].shard(index=0, num_shards=1000)
eval_dataset = dataset["validation"].shard(index=0, num_shards=4)

print(f"length train_dataset: {train_dataset.num_rows}")
print(f"length eval_dataset: {eval_dataset.num_rows}")

length train_dataset: 83
length eval_dataset: 8820


In [9]:
# eval_dataset[0]

In [10]:
def clean_and_parse(list_string):
    """Parse decoded model/label strings into Python lists of dicts.

    Each input string is handled independently:

    * a string ending in "]" is treated as a complete list literal and parsed;
    * a string containing no "[" at all (for example "unanswerable") is
      returned unchanged;
    * any other string is treated as a truncated list: it is cut after the
      last "}}" and closed with "]" before parsing.

    Parsing uses ``ast.literal_eval``, which only accepts literals and never
    executes code. A string that still cannot be parsed is returned unchanged
    instead of aborting the whole batch.

    Args:
        list_string: iterable of strings.

    Returns:
        A list with one item per input string: the parsed object, or the
        original string when it holds no list or cannot be parsed.
    """
    return_list = []
    for input_string in list_string:
        if input_string.endswith("]"):
            # Complete literal: parse it as-is. endswith() also copes with ""
            # where indexing [-1] would raise IndexError.
            candidate = input_string
        elif "[" not in input_string:
            return_list.append(input_string)
            continue
        else:
            # Truncated output: keep everything up to the last closed entry.
            end_pos = input_string.rfind('}}') + 2
            candidate = input_string[:end_pos] + " ]"

        try:
            return_list.append(ast.literal_eval(candidate))
        except (ValueError, SyntaxError, TypeError):
            # Malformed text: hand back the raw string, like the no-list case.
            return_list.append(input_string)

    return return_list

def calculate_fuzz_ratio(text1, text2):
    """Return fuzz.ratio of the two values compared as trimmed, lower-cased strings."""
    left, right = (str(value).strip().lower() for value in (text1, text2))
    return fuzz.ratio(left, right)

# Module-level accumulators that compute_metrics appends 1/0 entries to.
# "strict" lists record exact string equality between label and prediction;
# "partial" lists record whether the fuzzy ratio reached script_args.fuzz_ratio.
strict_list_tasks = []
partial_list_tasks = []
strict_list_datasets = []
partial_list_datasets = []
strict_list_metrics = []
partial_list_metrics = []
# The score lists are currently never appended to: the Score comparison in
# compute_metrics is disabled.
strict_list_scores = []
partial_list_scores = []

# Shared dict that compute_metrics writes its metric groups into and returns.
results = {}
def _leaderboard_of(entry):
    """Return the 'LEADERBOARD' dict of a parsed entry, or {} if the entry is malformed."""
    if isinstance(entry, dict):
        board = entry.get('LEADERBOARD')
        if isinstance(board, dict):
            return board
    return {}


def _score_field(label_board, pred_board, field_name, strict_hits, partial_hits):
    """Append one strict and one partial 1/0 hit for a single leaderboard field."""
    if field_name not in pred_board or field_name not in label_board:
        # A field missing on either side counts as a miss for both measures.
        strict_hits.append(0)
        partial_hits.append(0)
        return

    expected = label_board[field_name]
    predicted = pred_board[field_name]
    strict_hits.append(1 if expected == predicted else 0)
    fuzzy_score = calculate_fuzz_ratio(expected, predicted)
    partial_hits.append(1 if fuzzy_score >= script_args.fuzz_ratio else 0)


def compute_metrics(eval_preds):
    """Compute generation metrics for one evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays;
            ``predictions`` may be a tuple whose first item holds the ids.

    Returns:
        The module-level ``results`` dict, containing:

        * "General": accuracy/F1/precision/recall of the "unanswerable" vs.
          answerable decision, plus the ROUGE scores of the decoded texts;
        * "Tasks|Datasets|Metrics STRICT": exact-match scores per field;
        * "Tasks|Datasets|Metrics PARTIAL": fuzzy-match scores per field
          (a ratio >= ``script_args.fuzz_ratio`` is a hit).

    Field-level scores only cover aligned (label, prediction) entry pairs of
    examples where both sides parsed into a list; other examples are skipped.
    """
    # Reset the accumulators so one evaluation does not leak into the next.
    for bucket in (
        strict_list_tasks, partial_list_tasks,
        strict_list_datasets, partial_list_datasets,
        strict_list_metrics, partial_list_metrics,
        strict_list_scores, partial_list_scores,
    ):
        bucket.clear()

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded; swap in the pad id.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)  # type: ignore
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)  # type: ignore
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Drop any literal end-of-sequence marker left in the predictions.
    stripped_preds = [pred.replace("</s>", "") for pred in decoded_preds]

    clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    results["General"] = clf_metrics.compute(
        predictions=[1 if text == "unanswerable" else 0 for text in stripped_preds],
        references=[1 if text == "unanswerable" else 0 for text in decoded_labels],
    )

    rouge = evaluate.load('rouge')
    rouge_results = rouge.compute(predictions=stripped_preds, references=decoded_labels)
    results["General"].update(rouge_results)

    decoded_preds_clean = clean_and_parse(decoded_preds)
    decoded_labels_clean = clean_and_parse(decoded_labels)

    example_pairs = zip(decoded_labels_clean, decoded_preds_clean)
    for label_entries, pred_entries in tqdm(example_pairs, total=len(decoded_labels_clean)):
        # Non-list items are raw strings such as "unanswerable" (or text that
        # could not be parsed): there are no leaderboard entries to compare.
        # NOTE(review): an answerable label with an "unanswerable" prediction is
        # skipped rather than counted as a miss, as before - confirm intended.
        if not isinstance(label_entries, list) or not isinstance(pred_entries, list):
            continue

        # zip() pairs entries by position and stops at the shorter list.
        for label, pred in zip(label_entries, pred_entries):
            label_board = _leaderboard_of(label)
            pred_board = _leaderboard_of(pred)
            _score_field(label_board, pred_board, "Task", strict_list_tasks, partial_list_tasks)
            _score_field(label_board, pred_board, "Dataset", strict_list_datasets, partial_list_datasets)
            _score_field(label_board, pred_board, "Metric", strict_list_metrics, partial_list_metrics)
            # NOTE: the "Score" field is not evaluated; strict_list_scores and
            # partial_list_scores therefore stay empty.

    # Every reference is 1, so each hit list is scored against "all correct".
    results["Tasks STRICT"] = clf_metrics.compute(predictions=strict_list_tasks,
                        references=[1]*len(strict_list_tasks))
    results["Tasks PARTIAL"] = clf_metrics.compute(predictions=partial_list_tasks,
                        references=[1]*len(partial_list_tasks))
    results["Datasets STRICT"] = clf_metrics.compute(predictions=strict_list_datasets,
                        references=[1]*len(strict_list_datasets))
    results["Datasets PARTIAL"] = clf_metrics.compute(predictions=partial_list_datasets,
                        references=[1]*len(partial_list_datasets))
    results["Metrics STRICT"] = clf_metrics.compute(predictions=strict_list_metrics,
                        references=[1]*len(strict_list_metrics))
    results["Metrics PARTIAL"] = clf_metrics.compute(predictions=partial_list_metrics,
                        references=[1]*len(partial_list_metrics))

    return results


    
def tokenize_function(sample):
    """Tokenize prompts and answers into model inputs with masked labels.

    Args:
        sample: mapping with a "prompt" and an "answer" entry (the notebook
            calls this through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prompts with an extra "labels" entry:
        the answer token ids, where every pad token id is replaced by -100.
    """
    # Prompts are padded/truncated to max_source_length.
    model_inputs = tokenizer(sample["prompt"], max_length=script_args.max_source_length,
                             padding="max_length", truncation=True,
                             return_tensors="pt")

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["answer"], max_length=script_args.max_target_length, padding="max_length",
                       truncation=True, return_tensors="pt")

    # Replace padding by -100 in a single tensor operation instead of a
    # per-token Python loop that built nested lists of 0-d tensors.
    label_ids = labels["input_ids"]
    model_inputs["labels"] = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return model_inputs

In [11]:
# Run the tokenizer over both splits in batches. The original columns are not
# removed, so the new token columns are added next to them.
train_tokenized_dataset = train_dataset.map(function=tokenize_function, batched=True)
eval_tokenized_dataset = eval_dataset.map(function=tokenize_function, batched=True)

print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

Map:   0%|          | 0/8820 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['prompt', 'answer', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [12]:
# Adafactor without an explicit learning rate (lr=None, relative_step=True).
optimizer = Adafactor(
    model.parameters(),
    lr=None,
    relative_step=True,
    scale_parameter=True,
    warmup_init=True,
)
# Scheduler object wrapped around the Adafactor optimizer above.
lr_scheduler = AdafactorSchedule(optimizer)

# Early stopping with a patience of 10 and a threshold of 0.001.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=10,
)

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Run identity and output location.
    run_name=script_args.run_name,
    output_dir=script_args.output_dir,

    # Batch sizes.
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,

    # Schedule.
    # NOTE(review): an explicit (optimizer, lr_scheduler) pair is passed to the
    # trainer below, so the three learning-rate settings here are presumably
    # not what drives optimisation - confirm.
    num_train_epochs=script_args.num_train_epochs,
    learning_rate=script_args.learning_rate,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_steps=script_args.num_warmup_steps,

    # Logging and checkpointing; evaluation and saving both run per epoch.
    logging_steps=script_args.logging_steps,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    generation_max_length=script_args.max_target_length,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[early_stopping_callback],
)

# Training is disabled in this run; only the loaded checkpoint is evaluated.
# trainer.train()
trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Scratch check: raw fuzz.ratio score for two short, similar words.
fuzz.ratio("hello", "hola")

In [None]:
# Scratch check: the normalising wrapper applied to two unrelated strings.
calculate_fuzz_ratio("hello", "Task jhgjkjg")

In [None]:
# import ast

# def clean_and_parse(input_string):
#     # Find the last valid dictionary's end position
#     end_pos = input_string.rfind('}}') + 2
#     cleaned_string = input_string[:end_pos] + " ]"
#     # ipdb.set_trace()
    
#     # Convert to q
    
#     list_of_dicts = ast.literal_eval(cleaned_string)
#     return list_of_dicts

# input_str = "[{ 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'enwik8', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Number of params'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': 'Bit per Character (BPC)'}}, { 'LEADERBOARD': { 'Task': 'Language Modelling', 'Dataset': 'Hutter Prize', 'Metric': "

# parsed_data = clean_and_parse(input_str)
# print(parsed_data)


## Inference on a sample Input

In [None]:
# eval_dataset[0]
# # 

In [None]:
# sample_input = eval_dataset[5]['prompt']
# inputs = tokenizer.encode(sample_input, return_tensors="pt").to(device)

# with torch.no_grad():
#     outputs = model.generate(inputs)

# decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(decoded_output)

In [None]:
# outputs

In [None]:
# trainer.predict(inputs)