In [42]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

#from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner
from datasets import Dataset
from datasets import load_dataset
from pprint import pprint
from tqdm import tqdm
logger = logging.getLogger(__name__)
global_config = None

In [2]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [3]:
dataset = load_dataset('s2e-lab/RegexEval', split='train')

In [4]:
#model_name = "microsoft/phi-1_5"
model_name = "EleutherAI/pythia-70m"
base_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
base_model.to(device)

Downloading config.json: 100%|██████████| 567/567 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading model.safetensors: 100%|██████████| 166M/166M [00:24<00:00, 6.69MB/s] 


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [5]:
#torch.set_default_device("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Downloading tokenizer_config.json: 100%|██████████| 396/396 [00:00<?, ?B/s] 
Downloading tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.93MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<?, ?B/s]


In [6]:
import pandas as pd

instruction_dataset_df = pd.DataFrame(dataset)
instruction_dataset_dict = instruction_dataset_df.to_dict()

In [7]:
instruction_dataset_df.head(2)

Unnamed: 0,expression,raw_prompt,refined_prompt,matches,non_matches,id
0,^\d$,Matches exactly 1 numeric digit (0-9).,Matches exactly 1 numeric digit (0-9).\nMatch ...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]","[a, 324, num, location = 3, ssda, 11, hello wo...",1
1,^\d{5}$,"Matches 5 numeric digits, such as a zip code.","Matches 5 numeric digits, such as a zip code.\...","[33333, 55555, 23445, 89343, 46556, 25432, 253...","[abcd, 1324, as;lkjdf, jaldks, 234, 8hr4f, fsd...",2


In [22]:
instruction_dataset_df['refined_prompt'].apply(lambda x: len(x.split())).max()

144

In [8]:
def form_finetuning_dataset(dataset_dict: dict, question_key: str, answer_key: str) -> Dataset:
    prompt_template = """### Generate a regex for this description:
    {question}
    
    ### Answer:"""
    
    num_samples = len(dataset_dict[question_key])
    finetuning_dataset_list = []
    for i in range(num_samples):
      question = dataset_dict[question_key][i]
      answer = dataset_dict[answer_key][i]
      text_with_prompt_template = prompt_template.format(question=question)
      finetuning_dataset_list.append({"question": text_with_prompt_template, "answer": answer})

    finetuning_dataset = Dataset.from_list(finetuning_dataset_list)

    print("One datapoint in the finetuning dataset:")
    pprint(finetuning_dataset[0])
    
    return finetuning_dataset

In [9]:
finetuning_dataset = form_finetuning_dataset(instruction_dataset_dict, question_key = "refined_prompt", answer_key = "expression")

One datapoint in the finetuning dataset:
{'answer': '^\\d$',
 'question': '### Generate a regex for this description:\n'
             '    Matches exactly 1 numeric digit (0-9).\n'
             'Match examples:\n'
             '- "1"\n'
             '- "2"\n'
             '- "3"\n'
             'Non-match examples:\n'
             '- "a"\n'
             '- "324"\n'
             '    \n'
             '    ### Answer:'}


In [23]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['question'] , max_length = 128, truncation = True, padding='max_length')
    
    target_encodings = tokenizer(example_batch['answer'], max_length = 128, truncation = True, padding='max_length')
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    

In [24]:
tokenized_dataset = finetuning_dataset.map(convert_examples_to_features, batched=True)

Map: 100%|██████████| 762/762 [00:00<00:00, 3734.51 examples/s]


In [25]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [26]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 685
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})

In [27]:
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

In [28]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=base_model, padding=True)

In [29]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="test-trainer", num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
) 

In [30]:
from transformers import Trainer

trainer = Trainer(
    base_model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=42, training_loss=6.863502139136905, metrics={'train_runtime': 473.2012, 'train_samples_per_second': 1.448, 'train_steps_per_second': 0.089, 'total_flos': 23054512029696.0, 'train_loss': 6.863502139136905, 'epoch': 0.98})

In [32]:
## Save model
base_model.save_pretrained("pythia-tuned-model")

In [31]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\tokenizer.json')

In [87]:
# Evaluation

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def exact_match_ratio(predictions, references):
    total_cases = len(predictions)
    correct_cases = sum(1 for pred, ref in zip(predictions, references) if pred == ref)
    return correct_cases / total_cases if total_cases > 0 else 0.0


def calculate_metric_on_test_ds(dataset, model, tokenizer, 
                                batch_size=16, 
                                device=device, 
                                question_key="question", 
                                answer_key="answer"):
                            
    input_batches = list(generate_batch_sized_chunks(dataset[question_key], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[answer_key], batch_size))

    predictions_batches = []
    references_batches = []
    for input_batches, target_batch in tqdm(zip(input_batches, target_batches), total=len(input_batches)):
        
        inputs = tokenizer(input_batches, 
                           max_length=128, 
                           truncation=True, 
                           padding="max_length", 
                           return_tensors="pt")
        
        regex = model.generate(input_ids=inputs["input_ids"].to(device),
                                 attention_mask=inputs["attention_mask"].to(device), 
                                 num_beams=8, 
                                 max_length=512)
        
        decoded_regex = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in regex]      
        
        decoded_regex = [d.replace("", " ") for d in decoded_regex]
        
        predictions_batches.append(decoded_regex[0])
        references_batches.append(target_batch[0])
        

    score = exact_match_ratio(predictions_batches, references_batches)
    return score

In [88]:
score = calculate_metric_on_test_ds(test_dataset[0:16],  
                                    model=trainer.model, 
                                    tokenizer=tokenizer, 
                                    batch_size = 2, 
                                    question_key='question', 
                                    answer_key='answer')


print("Exact matches persentage: ", score)

  0%|          | 0/8 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
 12%|█▎        | 1/8 [00:03<00:22,  3.29s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 25%|██▌       | 2/8 [00:05<00:17,  2.88s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 38%|███▊      | 3/8 [00:08<00:14,  2.81s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 50%|█████     | 4/8 [00:11<00:11,  2

0.0



