# Lab for Why finetuning: compare finetuned vs. non=finetuned open-source models

In [None]:
# !pip install --upgrade --force-reinstall --ignore-installed lamini

In [41]:
from llama import BasicModelRunner
import os

# Hugging Face API key
api_key = "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"
# set environment
os.environ["HUGGINGFACE_HUB_API_TOKEN"] = api_key

In [42]:
non_finetuned = BasicModelRunner("meta-llama/Llama-2-7b-hf")

In [43]:
non_finetuned_output = non_finetuned("Tell me how to train my dog to sit")

AuthenticationError: Missing API Key

# Lab 2: Where finetuning fits in

In [8]:
import jsonlines
import itertools
import pandas as pd
from pprint import pprint

import datasets
from datasets import load_dataset

In [16]:
pretrained_dataset = load_dataset("databricks/databricks-dolly-15k", split="train", streaming=True)

In [11]:
!pip install zstandard

Collecting zstandard
  Downloading zstandard-0.22.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.9 kB)
Downloading zstandard-0.22.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.22.0


In [17]:
n = 5
print("Pretrained dataset:")
top_n = itertools.islice(pretrained_dataset, n)
for i in top_n:
    print(i)

Pretrained dataset:
{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}
{'instruction': 'Which is a species of fish? Tope or Rope', 'context': '', 'response': 'Tope', 'category': 'classification'}
{'instruction': 'Why can camels survive for long without water?', 'context': '', 'response': 'Camels use the fat in their

In [22]:
instruction_dataset_df = pd.read_parquet("hf://datasets/kotzeje/lamini_docs.jsonl/data/train-00000-of-00001-6359aa989b671345.parquet")
# instruction_dataset_df.head()
instruction_dataset_df

Unnamed: 0,question,answer
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support..."
...,...,...
1395,Does Lamini have the ability to understand and...,"Yes, Lamini has the ability to understand and ..."
1396,Can I fine-tune the pre-trained models provide...,"Yes, you can fine-tune the pre-trained models ..."
1397,Can Lamini generate text that is suitable for ...,"Yes, Lamini can generate text that is suitable..."
1398,Does the documentation have a secret code that...,I wish! This documentation only talks about La...


In [24]:
examples = instruction_dataset_df.to_dict()
text = examples["question"][0] + examples["answer"][0]
text

"How can I evaluate the performance and quality of the generated text from Lamini models?There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."

In [25]:
if "question" in examples and "answer" in examples:
    text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
    text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
    text = examples["input"][0] + examples["output"][0]
else:
    text = examples["text"][0]

In [44]:
prompt_template_qa = """### Question:
{question}

### Answer:
{answer}"""

In [45]:
question = examples["question"][0]
answer = examples["answer"][0]

text_with_prompt_template = prompt_template_qa.format(question=question, answer=answer)
text_with_prompt_template

"### Question:\nHow can I evaluate the performance and quality of the generated text from Lamini models?\n\n### Answer:\nThere are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."

In [46]:
prompt_template_q = """### Question:
{question}

### Answer:"""

In [47]:
num_examples = len(examples["question"])
finetuning_dataset_text_only = []
finetuning_dataset_question_answer = []
for i in range(num_examples):
    question = examples["question"][i]
    answer = examples["answer"][i]

    text_with_prompt_template_qa = prompt_template_qa.format(question=question, answer=answer)
    finetuning_dataset_text_only.append({"text": text_with_prompt_template})

    text_with_prompt_template_q = prompt_template_q.format(question=question, answer=answer)
    finetuning_dataset_question_answer.append({"question": text_with_prompt_template_q})

In [49]:
pprint(finetuning_dataset_text_only[0])

{'text': '### Question:\n'
         'How can I evaluate the performance and quality of the generated text '
         'from Lamini models?\n'
         '\n'
         '### Answer:\n'
         'There are several metrics that can be used to evaluate the '
         'performance and quality of generated text from Lamini models, '
         'including perplexity, BLEU score, and human evaluation. Perplexity '
         'measures how well the model predicts the next word in a sequence, '
         'while BLEU score measures the similarity between the generated text '
         'and a reference text. Human evaluation involves having human judges '
         'rate the quality of the generated text based on factors such as '
         'coherence, fluency, and relevance. It is recommended to use a '
         'combination of these metrics for a comprehensive evaluation of the '
         "model's performance."}


In [50]:
pprint(finetuning_dataset_question_answer[0])

{'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [51]:
with jsonlines.open(f"lamini_docs_processed.jsonl", "w") as writer:
    writer.write_all(finetuning_dataset_question_answer)

In [36]:
finetuning_dataset_name = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_name)
print(finetuning_dataset)

Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


# Lab 3: Instruction tuning lab

In [52]:
import itertools
import jsonlines

from datasets import load_dataset
from pprint import pprint

from llama import BasicModelRunner
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [53]:
instruction_tuned_dataset = load_dataset("tatsu-lab/alpaca", split="train", streaming=True) # streaming为True表示流式加载

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

In [57]:
m = 5
print("Pretrained dataset:")
top_m = itertools.islice(instruction_tuned_dataset, m)
for i in top_m:
    print(i)

Pretrained dataset:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary 

In [58]:
promt_template_with_input = """Below is an instruction that describes a task, paired with an input that provides further controll

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

prompt_template_without_input = """Below is an instruction that describes a task, Write a response that appropriately completes

### Instruction:
{instruction}

### Response:"""

In [62]:
processed_data = []
for j in top_m:
    if not j["input"]:
        processed_prompt = prompt_template_without_input.format(instruction=j["instruction"])
    else:
        processed_prompt = prompt_template_with_input.format(instruction=j["instruction"])
        
    processed_data.append({"input":processed_prompt, "output":j["output"]})
    

In [64]:
len(processed_data)

0

In [65]:
non_instruct_model = BasicModelRunner("meta-llama/Llama-2-7b-hf")

In [66]:
non_instruct_output = non_instruct_model("Tell me how to train my dog to sit",)

AuthenticationError: Missing API Key

In [67]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [68]:
def interence(text, model, tokenizer, max_input_tokens=1000, max_output_token):
    # Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensor="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_length=max_output_tokens
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

SyntaxError: non-default argument follows default argument (1831395828.py, line 1)