In [42]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

#from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from datasets import Dataset
from datasets import load_dataset
from pprint import pprint
from tqdm import tqdm
logger = logging.getLogger(__name__)
global_config = None

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers.trainer_callback import TrainerCallback
from transformers import BitsAndBytesConfig
from trl import SFTTrainer
import mlflow

In [None]:
dataset_name = 's2e-lab/RegexEval'
model_name = 'openlm-research/open_llama_3b_v2'


In [3]:
raw_dataset = load_dataset('s2e-lab/RegexEval', split='train')

In [6]:
import pandas as pd

instruction_dataset_df = pd.DataFrame(raw_dataset)
instruction_dataset_dict = instruction_dataset_df.to_dict()

In [7]:
instruction_dataset_df.head(2)

Unnamed: 0,expression,raw_prompt,refined_prompt,matches,non_matches,id
0,^\d$,Matches exactly 1 numeric digit (0-9).,Matches exactly 1 numeric digit (0-9).\nMatch ...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]","[a, 324, num, location = 3, ssda, 11, hello wo...",1
1,^\d{5}$,"Matches 5 numeric digits, such as a zip code.","Matches 5 numeric digits, such as a zip code.\...","[33333, 55555, 23445, 89343, 46556, 25432, 253...","[abcd, 1324, as;lkjdf, jaldks, 234, 8hr4f, fsd...",2


In [22]:
instruction_dataset_df['refined_prompt'].apply(lambda x: len(x.split())).max()

144

In [None]:
#Combine the two attributes into an instruction string
rd_df['instruction'] = 'Create a detailed description for the following product: '+ rd_df['product']+', belonging to category: '+ rd_df['category']

rd_df = rd_df[['instruction', 'description']]

#Get a 5000 sample subset for fine-tuning purposes
rd_df_sample = rd_df.sample(n=5000, random_state=42)

#Define template and format data into the template for supervised fine-tuning
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(lambda x: template.format(x))
rd_df_sample.rename(columns={'description': 'response'}, inplace=True)
rd_df_sample['response'] = rd_df_sample['response'] + "\n### End"
rd_df_sample = rd_df_sample[['prompt', 'response']]

rd_df_sample['text'] = rd_df_sample["prompt"] + rd_df_sample["response"]
rd_df_sample.drop(columns=['prompt', 'response'], inplace=True)

In [8]:
def form_finetuning_dataset(dataset_dict: dict, question_key: str, answer_key: str) -> Dataset:
    instruction_template = """### Generate a regex for this description:
    {question}
    
    ### Answer:"""

    prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:
    
    {instruction}
    
    ### Response:\n"""
    
    num_samples = len(dataset_dict[question_key])
    finetuning_dataset_list = []
    for i in range(num_samples):
        question = dataset_dict[question_key][i]
        instruction = instruction_template.format(question=question)
        prompt = prompt_template.format(instruction=instruction)
        response = dataset_dict[answer_key][i] + "\n### End"
        text = prompt + response
        finetuning_dataset_list.append({"instruction": instruction, "response": answer, "text": text})

    finetuning_dataset = Dataset.from_list(finetuning_dataset_list)

    print("One datapoint in the finetuning dataset:")
    pprint(finetuning_dataset[0])
    
    return finetuning_dataset

In [9]:
finetuning_dataset = form_finetuning_dataset(instruction_dataset_dict, question_key = "refined_prompt", answer_key = "expression")

One datapoint in the finetuning dataset:
{'answer': '^\\d$',
 'question': '### Generate a regex for this description:\n'
             '    Matches exactly 1 numeric digit (0-9).\n'
             'Match examples:\n'
             '- "1"\n'
             '- "2"\n'
             '- "3"\n'
             'Non-match examples:\n'
             '- "a"\n'
             '- "324"\n'
             '    \n'
             '    ### Answer:'}


In [25]:
dataset = finetuning_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [26]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 685
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
#If only targeting attention blocks of the model
target_modules = ["q_proj", "v_proj"]

#If targeting all linear layers
#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

lora_config = LoraConfig(r=16,
                        target_modules = target_modules,
                        lora_alpha=8,
                        lora_dropout=0.05,
                        bias="none",
                        task_type="CAUSAL_LM")

In [None]:
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

In [None]:
training_args = TrainingArguments(
    output_dir="01-experiment",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 3.0,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [None]:
nf4_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Downloading tokenizer_config.json: 100%|██████████| 396/396 [00:00<?, ?B/s] 
Downloading tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.93MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<?, ?B/s]


In [None]:
model = LlamaForCausalLM.from_pretrained(
    model_name, device_map='auto', quantization_config=nf4_config,
)

In [None]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=256,
    args=training_args,
    )

In [None]:
#Upcast layer norms to float 32 for stability
for name, module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

In [None]:
# Initiate the training process
with mlflow.start_run(run_name='01-LoRA-Experiment'):
    trainer.train()

In [31]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [87]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def exact_match_ratio(predictions, references):
    total_cases = len(predictions)
    correct_cases = sum(1 for pred, ref in zip(predictions, references) if pred == ref)
    return correct_cases / total_cases if total_cases > 0 else 0.0


def calculate_metric_on_test_ds(dataset, model, tokenizer, 
                                batch_size=16, 
                                device=device, 
                                question_key="question", 
                                answer_key="answer"):
                            
    input_batches = list(generate_batch_sized_chunks(dataset[question_key], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[answer_key], batch_size))

    predictions_batches = []
    references_batches = []
    for input_batches, target_batch in tqdm(zip(input_batches, target_batches), total=len(input_batches)):
        
        inputs = tokenizer(input_batches, 
                           max_length=128, 
                           truncation=True, 
                           padding="max_length", 
                           return_tensors="pt")
        
        regex = model.generate(input_ids=inputs["input_ids"].to(device),
                                 attention_mask=inputs["attention_mask"].to(device), 
                                 num_beams=8, 
                                 max_length=512)
        
        decoded_regex = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in regex]      
        
        decoded_regex = [d.replace("", " ") for d in decoded_regex]
        
        predictions_batches.append(decoded_regex[0])
        references_batches.append(target_batch[0])
        

    score = exact_match_ratio(predictions_batches, references_batches)
    return score

In [88]:
score = calculate_metric_on_test_ds(test_dataset[0:16],  
                                    model=trainer.model, 
                                    tokenizer=tokenizer, 
                                    batch_size = 2, 
                                    question_key='question', 
                                    answer_key='answer')


print("Exact matches persentage: ", score)

  0%|          | 0/8 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
 12%|█▎        | 1/8 [00:03<00:22,  3.29s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 25%|██▌       | 2/8 [00:05<00:17,  2.88s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 38%|███▊      | 3/8 [00:08<00:14,  2.81s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 50%|█████     | 4/8 [00:11<00:11,  2

0.0



