# A8 Alpaca Instruction Tuning Evaluation

In [None]:
# !pip install accelerate

In [None]:
# !pip install trl

## Instruction-Tuning

In [3]:
# Path to the Alpaca JSON file; it is handed to Dataset.from_json below.
file_path = "alpaca_data.json"

### Step 1: Load the dataset

In [None]:
# !pip install datasets

In [4]:
# Step 1: Load the dataset
from datasets import Dataset

In [5]:
# Read the JSON file at `file_path` into a `datasets.Dataset`.
dataset = Dataset.from_json(file_path)

# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 52002
})

In [31]:
# Peek at the first record; it carries 'output', 'input' and 'instruction' fields.
dataset[0]

{'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'input': '',
 'instruction': 'Give three tips for staying healthy.'}

In [32]:
# dataset.select(range(100))

### Step 2: Load the model & Tokenizer

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# One checkpoint name drives both the tokenizer and the model.
model_name_or_path = "distilgpt2"

# Tokenizer: the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Causal language model loaded from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map = 'auto')

In [7]:
# set instruction
def formatting_prompts_func(example):
    """Render a column-oriented batch of records as training prompt strings.

    Args:
        example: Mapping whose 'instruction' and 'output' entries are
            index-aligned sequences (for instance ``dataset[:2]``).

    Returns:
        A list with one string per instruction, in input order. Each string
        is a "### Question:" line holding the instruction, followed by a
        "### Answer:" line holding the matching output.
    """
    return [
        f"### Question: {question}\n ### Answer: {example['output'][idx]}"
        for idx, question in enumerate(example['instruction'])
    ]

# Sanity check: render the prompt template for the first two records.
formatting_prompts_func(dataset[:2])

['### Question: Give three tips for staying healthy.\n ### Answer: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 '### Question: What are the three primary colors?\n ### Answer: The three primary colors are red, blue, and yellow.']

In [8]:
# Use DataCollatorForCompletionOnlyLM so that only the generated answer
# (the text after the response template) contributes to the training loss.
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

response_template = "### Answer:"

# The collator locates the template by matching token ids. GPT-2's BPE is
# context-sensitive: in a formatted prompt the template follows "\n " and its
# first token is " ###", while the bare string starts with "###". Matching on
# the bare string would therefore fail and leave every label masked.
# Encode the template in the same context the prompts use and drop the
# leading newline token so the ids match what appears in the training text.
response_template_ids = tokenizer.encode(
    "\n " + response_template, add_special_tokens=False
)[1:]

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer = tokenizer)
collator

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


DataCollatorForCompletionOnlyLM(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [10]:
# Make sure to pass a correct value for max_seq_length, as the default value
# will be set to min(tokenizer.model_max_length, 1024).
max_seq_length = min(tokenizer.model_max_length, 1024)
max_seq_length

1024

### Task 3: Evaluation

In [12]:
# Load the 'eval' split of tatsu-lab/alpaca_eval; it is later passed to the
# trainer as the evaluation set and used as the source of the generation prompt.
from datasets import load_dataset
eval_dataset = load_dataset("tatsu-lab/alpaca_eval", split='eval')
eval_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['instruction', 'output', 'generator', 'dataset'],
    num_rows: 805
})

### training

In [None]:
# TrainingArguments is not imported by any earlier cell; import it here so the
# cell does not raise NameError on a fresh kernel (Restart & Run All).
from transformers import TrainingArguments

max_seq_length = min(tokenizer.model_max_length, 1024)

# Save a checkpoint and run evaluation at the end of every epoch.
training_args = TrainingArguments(
                    save_strategy = 'epoch',
                    evaluation_strategy = 'epoch',
                    gradient_checkpointing = True,
                    per_device_train_batch_size = 2,
                    per_device_eval_batch_size = 2,
                    output_dir       ='tmp_trainer', # default
                    num_train_epochs = 3,
                )

# Fine-tune on small subsets: the first 100 training rows, with the first
# 50 rows of the evaluation dataset used for per-epoch evaluation.
trainer = SFTTrainer(
    model,
    args = training_args,
    train_dataset   = dataset.select(range(100)),
    eval_dataset    = eval_dataset.select(range(50)),
    formatting_func = formatting_prompts_func,
    data_collator   = collator,
    max_seq_length  = max_seq_length,
)

trainer.train()

### Save the model

In [None]:
# Write the trained model to disk through the trainer.
# NOTE(review): "modle" looks like a typo for "model"; the path is left as-is
# because other code may load from this exact directory - confirm before renaming.
save_path = "./app/modle"
trainer.save_model(save_path)

### Evaluate the model

In [None]:
# Run evaluation on the eval_dataset given to the trainer and display the result.
trainer.evaluate()

In [None]:
# Wrap the instruction in the same template used for fine-tuning and leave the
# answer open, so the model sees the prompt format it was trained on.
input_text = eval_dataset[0]['instruction']
prompt = f"### Question: {input_text}\n ### Answer:"

# Tokenize (input ids + attention mask) and move the tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]

# Generate with beam search.
# - top_k / top_p / temperature were dropped: they only take effect when
#   do_sample=True, so with beam search they were no-ops.
# - max_new_tokens bounds the answer length independently of the prompt length
#   (max_length also counts the prompt tokens).
# - pad_token_id is set explicitly because GPT-2 has no dedicated pad token.
output = model.generate(
    **inputs,
    max_new_tokens=256,
    num_beams=5,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode and print the generated text (prompt followed by the model's answer).
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated text:\n", generated_text)