# Setup the Environment

## Install Python Packages

In [None]:
# Install/upgrade the fine-tuning stack.
# `%pip` (instead of `!pip`) installs into the environment of the kernel running this notebook.
# NOTE(review): versions are unpinned. Later cells pass `tokenizer=`, `packing=`,
# `dataset_text_field=` and `max_seq_length=` directly to `SFTTrainer`; newer `trl` releases
# may no longer accept them — consider pinning the versions this notebook was validated with.
%pip install --quiet --upgrade accelerate peft bitsandbytes trl evaluate rouge_score

# Complete package list:
# pip install --upgrade accelerate peft bitsandbytes trl evaluate rouge_score torch transformers datasets tqdm tensorboard pandas

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m286.7/290.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.0/225.0 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m 

## Install Some Useful Process Viewers

- `nvtop` GPU process viewer
- `htop` CPU process viewer

In [None]:
# `-y` answers the confirmation prompt automatically so the cell cannot stall waiting for input;
# `apt-get` is used because `apt` does not guarantee a stable command-line interface for scripted use.
!apt-get install -y nvtop htop

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libnl-genl-3-200
Suggested packages:
  lm-sensors strace
The following NEW packages will be installed:
  htop libnl-genl-3-200 nvtop
0 upgraded, 3 newly installed, 0 to remove and 39 not upgraded.
Need to get 184 kB of archives.
After this operation, 511 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libnl-genl-3-200 amd64 3.5.0-0.1 [12.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 htop amd64 3.0.5-7build2 [128 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 nvtop amd64 1.2.2-1 [43.9 kB]
Fetched 184 kB in 1s (325 kB/s)
Selecting previously unselected package libnl-genl-3-200:amd64.
(Reading database ... 121753 files and directories currently installed.)
Preparing to unpack .../libnl-genl-3-200_3.5.0-0.1_amd64.deb ...
Unpacking libnl-genl-3-200:amd64 (3.

## Log into Hugging Face

- (Recommended) You can add your Hugging Face token to the Secrets tab on the left panel with the name "HF_TOKEN". This will allow authentication to pass automatically every time you need it.

- Otherwise, you need to use notebook_login to log in manually:
```python
from huggingface_hub import notebook_login
notebook_login()
```

- Alternatively, use the function `login(userdata.get('TOKEN_NAME'))` to automatically log in:
```python
from google.colab import userdata
from huggingface_hub import login
login(userdata.get('TOKEN_NAME'))
```

## Set Up the Cache Directory for Hugging Face

There are two options:

1. Use the environment variable `HF_DATASETS_CACHE`
    ```shell
    $ export HF_DATASETS_CACHE="/path/to/another/directory"
    ```

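    This only relocates the cache of the `datasets` library. To move the cache used for models and tokenizers as well, set `HF_HOME` (or `HF_HUB_CACHE`) in the same way; otherwise pass `cache_dir` explicitly as in option 2.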

2. (Recommended) Programmatically set the cache_dir when needed.

    For example, we use Google Drive as our persistent storage to cache the downloaded models, so we don't need to download them every time we restart or reload the notebook.

In [None]:
import os
from google.colab import drive
# Mount Google Drive so files written under it persist across notebook restarts.
drive.mount('/content/drive')
# Working folder on Drive; the dataset, the download cache and the final checkpoint live under it.
colab_dir = '/content/drive/MyDrive/Colab Notebooks/'
# Folder handed to `cache_dir=` when models/tokenizers are downloaded later on.
cache_dir = os.path.join(colab_dir, 'cache')

Mounted at /content/drive


# Create the Dataset

## Import Necessary Packages for This Section

In [None]:
# Built-in packages
import json
import os
from typing import Any, Dict, List, Union

# 3rd party packages
import pandas as pd
from datasets import load_dataset

# Location of the CSV dataset inside the Drive working folder (`colab_dir` is set in the setup cell).
dataset_path = os.path.join(colab_dir, 'dataset.csv')

## Upload Dataset to Your Drive

Download link: [dataset.csv](https://drive.google.com/file/d/1tjyWdtL5wTGvhv55C-qgoTkpDyWy19Y6/view?usp=sharing).

## Let's Glance at the Dataset

In [None]:
# Read the CSV with pandas only to preview its first rows.
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,instruction,input,output
0,Give three tips for staying healthy.,,1. Eat a balanced and nutritious diet: Make su...
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye..."
2,Describe the structure of an atom.,,An atom is the basic building block of all mat...
3,How can we reduce air pollution?,,There are several ways to reduce air pollution...
4,Describe a time when you had to make a difficu...,,"As an AI assistant, I do not have my own perso..."


## Load Dataset from Disk

In [None]:
# Without `split=`, `load_dataset` returns a DatasetDict; the CSV rows end up in its 'train' split.
dataset = load_dataset("csv", data_files=dataset_path)
print(dataset)

# Integer indexing yields a single example as a dict.
print(dataset['train'][0])

# Slicing yields a dict of columns, each holding a list of values.
print(dataset['train'][:2])

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': None, 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'}
{'instruction': ['Give three tips 

## Filtering

### Basic Usage

In [None]:
# `split='train'` returns the Dataset itself rather than a DatasetDict.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print(dataset)
# The predicate receives one example at a time; only examples for which it is truthy remain.
dataset = dataset.filter(
    lambda e: len(e['instruction']) < 64,                   # Keep instructions shorter than 64 characters.
    num_proc=2,                                             # Enable multiprocessing.
)
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


Filter (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 33565
})


### Batch Process

In [None]:
def batch_filter(examples: Dict[str, List]) -> List[bool]:
    """Batched predicate for `Dataset.filter`.

    Args:
        examples: A batch in column form; `examples['instruction']` holds the
            instruction strings of every example in the batch.

    Returns:
        One boolean per example: True when the instruction is shorter than
        64 characters (keep it), False otherwise (drop it).
    """
    return [len(text) < 64 for text in examples['instruction']]


# Reload the full dataset so this cell does not depend on the previous filtering example.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print(dataset)
# doc: https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Dataset.filter.function
# With `batched=True` the function receives up to `batch_size` examples per call, in column form.
dataset = dataset.filter(
    batch_filter,
    batched=True,
    batch_size=2048,
    num_proc=2,                                             # Enable multiprocessing.
)
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


Filter (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 33565
})


## Mapping

### Basic Usage

In [None]:
# Prompt layout: a fixed preamble, the instruction, then the response.
template = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Response:",
    "{output}",
])


def formatter(example: Dict[str, Any]) -> Dict[str, Any]:
    """Add a 'text' field holding the filled-in prompt for one example.

    Args:
        example: A single example; its entries are passed to the template as
            keyword arguments, so it must provide 'instruction' and 'output'.

    Returns:
        The same `example` object, modified in place with the new 'text' key.
    """
    prompt = template.format(**example)
    example['text'] = prompt
    return example

# Reload the full dataset, then add the 'text' column by applying `formatter` to each example.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print(dataset)
dataset = dataset.map(formatter, num_proc=2)
print(dataset)

# Show every field of the first example, separated by horizontal rules.
print("\n" + "-" * 40 + "\n")
for key, value in dataset[0].items():
    print(f"[{key}]")
    print(value)
    print("\n" + "-" * 40 + "\n")

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


Map (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})

----------------------------------------

[instruction]
Give three tips for staying healthy.

----------------------------------------

[input]
None

----------------------------------------

[output]
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune f

### <font color='#EC7063'>TODO 1:</font> Batch Process

In [None]:
# Prompt layout: a fixed preamble, the instruction, then the response.
template = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Response:",
    "{output}",
])


def formatter(examples: Dict[str, List]) -> Dict[str, List]:
    """Batched variant: add a 'text' column with one filled-in prompt per example.

    Args:
        examples: A batch in column form; 'instruction' and 'output' are
            parallel lists indexed by position in the batch.

    Returns:
        The same `examples` object with the new 'text' list attached.
    """
    examples['text'] = [
        template.format(instruction=instruction, output=examples['output'][row])
        for row, instruction in enumerate(examples['instruction'])
    ]
    return examples

# Reload the full dataset so this cell does not depend on the previous mapping example.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print(dataset)
# doc: https://huggingface.co/docs/datasets/v2.18.0/en/package_reference/main_classes#datasets.Dataset.map
# TODO
# With `batched=True`, `formatter` receives up to `batch_size` examples per call, in column form.
dataset = dataset.map(
    formatter,
    batched=True,
    batch_size=2048,
    num_proc=2,                                             # Enable multiprocessing.
)
print(dataset)

# Show every field of the first example, separated by horizontal rules.
print("\n" + "-" * 40 + "\n")
for key, value in dataset[0].items():
    print(f"[{key}]")
    print(value)
    print("\n" + "-" * 40 + "\n")

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


Map (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})

----------------------------------------

[instruction]
Give three tips for staying healthy.

----------------------------------------

[input]
None

----------------------------------------

[output]
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune f

# Showcase the Output of Raw LLaMA2

## Import Necessary Packages for This Section

In [None]:
# Built-in packages
import os
from typing import Dict, List, Union

# 3rd party packages
import evaluate
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

# Section-local configuration, repeated here so this section can run without the earlier ones.
device = torch.device('cuda:0')                             # first CUDA device
colab_dir = '/content/drive/MyDrive/Colab Notebooks/'
cache_dir = os.path.join(colab_dir, 'cache')
dataset_path = os.path.join(colab_dir, 'dataset.csv')

print(f'Cache directory: {cache_dir}')

Cache directory: /content/drive/MyDrive/Colab Notebooks/cache


## Prepare the Test Dataset

In [None]:
# Prompt layout: a fixed preamble, the instruction, then the response.
template = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Response:",
    "{output}",
])


def formatting_test(example):
    """Build the inference prompt for one example.

    The response slot of the template is left empty, so the resulting
    'text' ends right after the "### Response:" line.

    Args:
        example: Mapping that provides an 'instruction' entry.

    Returns:
        The same `example` object with the prompt stored under 'text'.
    """
    prompt = template.format(output="", instruction=example['instruction'])
    example['text'] = prompt
    return example


# Load every row of the CSV as a single Dataset.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print("[Raw Dataset]")
print(dataset)
print("-" * 40)

# Keep only the examples whose 'input' field is missing or empty.
print("[Filtered Dataset]")
dataset = dataset.filter(
    lambda e: e['input'] is None or e['input'] == "",
    num_proc=2,                                             # multiprocessing
)
print(dataset)
print("-" * 40)

# Hold out 10% of the examples as the test split.
print("[Splitted Dataset]")
dataset = dataset.train_test_split(test_size=0.1, seed=0)   # set seed for reproducibility
print(dataset)
print("-" * 40)

# Take the first 100 indices of a seeded random permutation of the test split,
# then attach the inference prompt ('text') to each selected example.
print("[Test Subset]")
test_subset_indices = torch.randperm(
    len(dataset['test']),
    generator=torch.Generator().manual_seed(0),             # set seed for reproducibility
)[:100]
dataset_test_subset = dataset['test'].select(test_subset_indices)
dataset_test_subset = dataset_test_subset.map(formatting_test, num_proc=2)
print(dataset_test_subset)
print("-" * 40)

[Raw Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})
----------------------------------------
[Filtered Dataset]


Filter (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 31338
})
----------------------------------------
[Splitted Dataset]
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 28204
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3134
    })
})
----------------------------------------
[Test Subset]


Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 100
})
----------------------------------------


## Load LLaMA2 Model

In [None]:
model_name = "meta-llama/Llama-2-7b-hf"

# Restart peak-memory tracking so the figure printed below reflects the peak since this point.
torch.cuda.reset_peak_memory_stats()

# 4-bit loading with the NF4 quantization type, double quantization and bfloat16 compute dtype.
# doc: https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# doc: https://huggingface.co/docs/transformers/v4.38.2/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained
# `cache_dir` points at the Drive cache folder; `device_map={"": 0}` presumably places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token                   # The pretrained LLaMA2 tokenizer has no padding token; reuse EOS.
print(f"tokenizer.add_bos_token: {tokenizer.add_bos_token}")
print(f"tokenizer.add_eos_token: {tokenizer.add_eos_token}")

# doc: https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html#torch.cuda.max_memory_allocated
# The value printed is the peak allocation since the reset above.
peak_memory = torch.cuda.max_memory_allocated(device)
peak_memory = peak_memory / 1024**3                         # Convert bytes to gigabytes (1024**3 bytes each)
print(f"Memory allocated: {peak_memory:.3}GB")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.add_bos_token: True
tokenizer.add_eos_token: False
Memory allocated: 3.73GB


## <font color='#EC7063'>TODO 2:</font> Inference

In [None]:
def inference(model, tokenizer, instruction: str) -> str:
    """Generate text for a single instruction.

    The instruction is wrapped in the prompt template via `formatting_test`,
    tokenized, moved to `device`, and passed to `model.generate` with at most
    256 new tokens. The first returned sequence is decoded with special
    tokens stripped.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=...)`.
        tokenizer: Callable tokenizer that also provides `decode`.
        instruction: The instruction text to answer.

    Returns:
        The decoded text of the first generated sequence.
        NOTE(review): for causal LMs `generate` normally returns the prompt
        tokens followed by the continuation, so this string presumably starts
        with the prompt — confirm before scoring it against references.
    """
    prompt = formatting_test({'instruction': instruction})['text']
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=256)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

## Test the Inference function with Data from the Test Subset

In [None]:
# Use the first example of the test subset as a smoke test.
instruction = dataset_test_subset[0]['instruction']
reference = dataset_test_subset[0]['output']
print("[Instruction]")
print(instruction)
print("")
print("[Reference]")
print(reference)
print("")
# Generate with the model loaded above and print the result next to the reference for comparison.
output = inference(model, tokenizer, instruction)
print("[Output]")
print(output)

[Instruction]
List the top 5 tourist sites in Rome

[Reference]
Here are the top 5 tourist sites in Rome:

1. Colosseum: This iconic symbol of Rome is a must-visit for tourists. This ancient amphitheater was built to host gladiatorial contests and other public spectacles.

2. Vatican Museums: The Vatican Museums are located within Vatican City, and they house one of the largest and most impressive art collections in the world, including the famous Sistine Chapel.

3. Pantheon: The Pantheon is an ancient temple dedicated to all the gods of Rome. It’s an architectural marvel, with its perfectly proportioned dome and oculus.

4. Trevi Fountain: The Trevi Fountain is one of the most famous fountains in the world, known for its Baroque style and the tradition of throwing coins into the fountain for good luck.

5. Roman Forum: The Roman Forum was the center of political and social activity in ancient Rome, and its ruins are a testimony to the grandeur of the Roman Empire. Today, tourists can

# Start to Fine-tune LLaMA2

## Import Necessary Packages for This Section

In [None]:
# Built-in packages
import os
import warnings

# 3rd party packages
import evaluate
import torch
from datasets import load_dataset
from google.colab import userdata
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Section-local configuration, repeated here so this section can run without the earlier ones.
device = torch.device('cuda:0')                             # first CUDA device
colab_dir = '/content/drive/MyDrive/Colab Notebooks/'
cache_dir = os.path.join(colab_dir, 'cache')
dataset_path = os.path.join(colab_dir, 'dataset.csv')

# Ignore every FutureWarning (added to hide the ones coming from `accelerate`).
warnings.simplefilter(action='ignore', category=FutureWarning)

print(f'Cache directory: {cache_dir}')

Cache directory: /content/drive/MyDrive/Colab Notebooks/cache


## Prepare the Train Dataset

In [None]:
# Prompt layout: a fixed preamble, the instruction, then the response.
template = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Response:",
    "{output}",
])


def formatting_train(example):
    """Build the training text for one example.

    Both the instruction and the reference output are substituted into the
    template; the example's entries are passed as keyword arguments, so it
    must provide 'instruction' and 'output'.

    Args:
        example: Mapping holding the fields of a single example.

    Returns:
        The same `example` object with the full text stored under 'text'.
    """
    full_text = template.format(**example)
    example['text'] = full_text
    return example


# Load every row of the CSV as a single Dataset.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print("[Raw Dataset]")
print(dataset)
print("-" * 40)

# Keep only the examples whose 'input' field is missing or empty.
print("[Filtered Dataset]")
dataset = dataset.filter(
    lambda e: e['input'] is None or e['input'] == "",
    num_proc=2,                                             # multiprocessing
)
print(dataset)
print("-" * 40)

# Same split parameters (10% test, seed 0) as the evaluation sections use.
print("[Splitted Dataset]")
dataset = dataset.train_test_split(test_size=0.1, seed=0)   # set seed for reproducibility
print(dataset)
print("-" * 40)

# Take the first 1000 indices of a seeded random permutation of the train split,
# then attach the full training text ('text') to each selected example.
print("[Train Subset]")
train_subset_indices = torch.randperm(
    len(dataset['train']),
    generator=torch.Generator().manual_seed(0),             # set seed for reproducibility
)[:1000]
dataset_train_subset = dataset['train'].select(train_subset_indices)
dataset_train_subset = dataset_train_subset.map(formatting_train, num_proc=2)
print(dataset_train_subset)
print("-" * 40)

[Raw Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})
----------------------------------------
[Filtered Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 31338
})
----------------------------------------
[Splitted Dataset]
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 28204
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3134
    })
})
----------------------------------------
[Train Subset]
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1000
})
----------------------------------------


## Load LLaMA2 in 4-bit Format

### There are several types of memory used in the training stage
- Model parameters
- Model gradients
- Optimizer states (e.g. the moment estimates kept by AdamW)
- Forward propagation
    - Propagation results
    - Forward cache for calculating gradient
- Backward propagation

### Comparison of fp32, fp16, bf16
![bf16](https://blogs.nvidia.com/wp-content/uploads/2020/05/tf32-Mantissa-chart-hi-res-FINAL.png)

Reference: [NVIDIA Blog](https://blogs.nvidia.com/blog/tensorfloat-32-precision-format/)

### QLoRA
- 4bit NormalFloat (nf4)
- Double Quantization
- Paged Optimizers

Reference: [QLORA: Efficient Finetuning of Quantized LLMs](https://openreview.net/pdf?id=OUIFPHEgJU)

In [None]:
model_name = "meta-llama/Llama-2-7b-hf"

# Restart peak-memory tracking; the peak is read back after the LoRA adapters are attached.
torch.cuda.reset_peak_memory_stats()

# 4-bit loading with the NF4 quantization type, double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# `cache_dir` points at the Drive cache folder; `device_map={"": 0}` presumably places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Prepare Model for LoRA Training

![lora](https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5dfbd169-eb7e-41e1-a050-556ccd6fb679_1600x672.png)

Reference: [blog](https://magazine.sebastianraschka.com/p/practical-tips-for-finetuning-llms)

In [None]:
# doc: https://huggingface.co/docs/peft/package_reference/peft_model#peft.prepare_model_for_kbit_training
# Prepare the quantized model for k-bit training with gradient checkpointing turned on.
# doc: https://huggingface.co/docs/peft/package_reference/peft_model#peft.prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True}   # set explicitly to silence the warning
)
# LoRA settings: rank 32, alpha 16, dropout 0.05, applied to the listed projection modules.
# doc: https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig
lora_config = LoraConfig(
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
# Wrap the base model according to the LoRA configuration.
model = get_peft_model(model, lora_config)
model.config.use_cache = False                              # silence the warnings. Please re-enable for inference!

# Peak allocation since the `reset_peak_memory_stats()` call made in the model-loading cell.
peak_memory = torch.cuda.max_memory_allocated(device)       # https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html#torch.cuda.max_memory_allocated
peak_memory = peak_memory / 1024**3                         # Convert bytes to gigabytes (1024**3 bytes each)
print(f"Max memory allocated: {peak_memory:.3}GB")

Max memory allocated: 4.46GB


## Prepare LLaMA2 Tokenizer

In [None]:
# Tokenizer used for training; unlike the inference sections it also appends EOS to every text.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token                   # The pretrained LLaMA2 tokenizer has no padding token; reuse EOS.
tokenizer.add_eos_token = True                              # Appending EOS to the end of each text is important for training.
print(f"tokenizer.add_bos_token: {tokenizer.add_bos_token}")
print(f"tokenizer.add_eos_token: {tokenizer.add_eos_token}")

tokenizer.add_bos_token: True
tokenizer.add_eos_token: True


## Training Configuration

In [None]:
# Hyper-parameters for the single-epoch fine-tuning run.
training_arguments = TrainingArguments(
    output_dir='./results',                                 # intermediate checkpoints go to the local runtime disk
    max_grad_norm=0.3,
    warmup_ratio=0.3,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,

    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",

    # Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),

    # Log every 30 steps; checkpoint every 100 steps, keeping at most 5 checkpoints.
    logging_steps=30,
    save_total_limit=5,
    save_strategy="steps",
    save_steps=100,
)

## Trainer configuration

In [None]:
# Supervised fine-tuning trainer over the 'text' column of the training subset.
# NOTE(review): these keyword arguments match the `trl` version this notebook was written for;
# confirm they are still accepted if `trl` is upgraded.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train_subset,
    tokenizer=tokenizer,
    args=training_arguments,

    # doc: https://huggingface.co/docs/trl/sft_trainer#trl.SFTTrainer.packing
    packing=True,
    dataset_text_field="text",
    max_seq_length=1024,
)

Generating train split: 0 examples [00:00, ? examples/s]

## Start Training

In [None]:
# Run training, then store the final model on Drive where the evaluation section loads it from.
trainer.train()
trainer.save_model(os.path.join(colab_dir, "checkpoint-last"))

Step,Training Loss
30,1.3196
60,1.0367
90,0.914
120,0.9153
150,0.9089
180,0.9002
210,0.9127
240,0.8743


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

# Load Fine-tuned Model from Disk and Evaluate It

## Import Necessary Packages for This Section

In [None]:
# Built-in packages
import os

# 3rd party packages
import torch
from datasets import load_dataset
from google.colab import userdata
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

# Section-local configuration, repeated here so this section can run without the earlier ones.
device = torch.device('cuda:0')                             # first CUDA device
colab_dir = '/content/drive/MyDrive/Colab Notebooks/'
cache_dir = os.path.join(colab_dir, 'cache')
dataset_path = os.path.join(colab_dir, 'dataset.csv')

## Prepare the Test Dataset

In [None]:
# Prompt layout: a fixed preamble, the instruction, then the response.
template = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{instruction}",
    "",
    "### Response:",
    "{output}",
])


def formatting_test(example):
    """Build the inference prompt for one example.

    The response slot of the template is left empty, so the resulting
    'text' ends right after the "### Response:" line.

    Args:
        example: Mapping that provides an 'instruction' entry.

    Returns:
        The same `example` object with the prompt stored under 'text'.
    """
    prompt = template.format(output="", instruction=example['instruction'])
    example['text'] = prompt
    return example


# Load every row of the CSV as a single Dataset.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print("[Raw Dataset]")
print(dataset)
print("-" * 40)

# Keep only the examples whose 'input' field is missing or empty.
print("[Filtered Dataset]")
dataset = dataset.filter(
    lambda e: e['input'] is None or e['input'] == "",
    num_proc=2,                                             # multiprocessing
)
print(dataset)
print("-" * 40)

# Same split parameters (10% test, seed 0) as the training section uses.
print("[Splitted Dataset]")
dataset = dataset.train_test_split(test_size=0.1, seed=0)   # set seed for reproducibility
print(dataset)
print("-" * 40)

# Take the first 100 indices of a seeded random permutation of the test split,
# then attach the inference prompt ('text') to each selected example.
print("[Test Subset]")
test_subset_indices = torch.randperm(
    len(dataset['test']),
    generator=torch.Generator().manual_seed(0),             # set seed for reproducibility
)[:100]
dataset_test_subset = dataset['test'].select(test_subset_indices)
dataset_test_subset = dataset_test_subset.map(formatting_test, num_proc=2)
print(dataset_test_subset)
print("-" * 40)

[Raw Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})
----------------------------------------
[Filtered Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 31338
})
----------------------------------------
[Splitted Dataset]
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 28204
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3134
    })
})
----------------------------------------
[Test Subset]
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 100
})
----------------------------------------


## Load LLaMA2 in 4-bit Format

In [None]:
model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit loading with the NF4 quantization type, double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Base model; the LoRA weights are attached to it in the next cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)
# Inference tokenizer: `add_eos_token` is left at its default here (the printout below shows it).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token                   # The pretrained LLaMA2 tokenizer has no padding token; reuse EOS.
print(f"tokenizer.add_bos_token: {tokenizer.add_bos_token}")
print(f"tokenizer.add_eos_token: {tokenizer.add_eos_token}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.add_bos_token: True
tokenizer.add_eos_token: False


## Load LoRA Weight from Checkpoint

In [None]:
# Attach the adapter saved by the training section ("checkpoint-last" on Drive) to the base model.
model = PeftModel.from_pretrained(
    model,
    os.path.join(colab_dir, "checkpoint-last"),
    device_map={"": 0},
)
# Switch to evaluation mode before generating.
model.eval()
# NOTE(review): no `reset_peak_memory_stats()` call is visible in this section, so this is
# presumably the peak since the process started — confirm if an exact per-step figure is needed.
peak_memory = torch.cuda.max_memory_allocated(device)
peak_memory = peak_memory / 1024**3     # Convert bytes to gigabytes (1024**3 bytes each)
print(f"Memory allocated: {peak_memory:.3}GB")

Memory allocated: 4.21GB


## Showcase the Output of Fine-tuned LLaMA2

In [None]:
def inference(model, tokenizer, instruction: str) -> str:
    """Generate text for a single instruction.

    The instruction is wrapped in the prompt template via `formatting_test`,
    tokenized, moved to `device`, and passed to `model.generate` with at most
    512 new tokens. The first returned sequence is decoded with special
    tokens stripped.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=...)`.
        tokenizer: Callable tokenizer that also provides `decode`.
        instruction: The instruction text to answer.

    Returns:
        The decoded text of the first generated sequence.
        NOTE(review): for causal LMs `generate` normally returns the prompt
        tokens followed by the continuation, so this string presumably starts
        with the prompt — confirm before scoring it against references.
    """
    prompt = formatting_test({'instruction': instruction})['text']
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_new_tokens=512)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Same example as in the raw-model showcase, so the two outputs can be compared.
instruction = dataset_test_subset[0]['instruction']
reference = dataset_test_subset[0]['output']
print("[Instruction]")
print(instruction)
print("")
print("[Reference]")
print(reference)
print("")
# Generate with the fine-tuned model and print the result next to the reference.
output = inference(model, tokenizer, instruction)
print("[Output]")
print(output)

[Instruction]
List the top 5 tourist sites in Rome

[Reference]
Here are the top 5 tourist sites in Rome:

1. Colosseum: This iconic symbol of Rome is a must-visit for tourists. This ancient amphitheater was built to host gladiatorial contests and other public spectacles.

2. Vatican Museums: The Vatican Museums are located within Vatican City, and they house one of the largest and most impressive art collections in the world, including the famous Sistine Chapel.

3. Pantheon: The Pantheon is an ancient temple dedicated to all the gods of Rome. It’s an architectural marvel, with its perfectly proportioned dome and oculus.

4. Trevi Fountain: The Trevi Fountain is one of the most famous fountains in the world, known for its Baroque style and the tradition of throwing coins into the fountain for good luck.

5. Roman Forum: The Roman Forum was the center of political and social activity in ancient Rome, and its ruins are a testimony to the grandeur of the Roman Empire. Today, tourists can

# Quantitatively Evaluate the Generated Outputs using ROUGE Score

### General Example of ROUGE score (Single Output and Multiple References):

- Generated Sentence (G1):
```
police ended the gunman.
```
- Reference Sentences:

    1. R1
    ```
    Police killed the gunman.
    ```
    2. R2
    ```
    The gunman was shot down by police.
    ```

- ROUGE-1 score is:

   $$
   \frac{{3 + 3}}{{4 + 7}} = 0.\overline{54}
   $$

    - First 3 (G1 in R1): `police`, `the`, and `gunman`.
    
    - Second 3 (G1 in R2): `the`, `gunman`, and `police`.
    
    - 4: the length of R1
    - 7: the length of R2

**Note: In our test subset, there is only one reference for each input.**

## Import Necessary Packages for This Section

In [None]:
# Built-in packages
import os
from typing import Dict, List, Union, Tuple

# 3rd party packages
import evaluate
import torch
from datasets import load_dataset
from peft import PeftModel
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    StoppingCriteriaList,
    StoppingCriteria,
)

# Section-local configuration, repeated here so this section can run without the earlier ones.
device = torch.device('cuda:0')                             # first CUDA device
colab_dir = '/content/drive/MyDrive/Colab Notebooks/'
cache_dir = os.path.join(colab_dir, 'cache')
dataset_path = os.path.join(colab_dir, 'dataset.csv')

print(f'Cache directory: {cache_dir}')

Cache directory: /content/drive/MyDrive/Colab Notebooks/cache


## Prepare Testing Dataset

In [None]:
# Prompt layout: a fixed preamble, then an "Instruction" section and a
# "Response" section. `{instruction}` and `{output}` are filled via str.format.
template = "".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
    "### Instruction:\n",
    "{instruction}\n\n",
    "### Response:\n",
    "{output}",
])


def formatting_test(example):
    """Attach an inference prompt to a dataset example.

    The prompt is `template` filled with the example's instruction and an
    empty response, so the text ends right after the "### Response:" header.

    Args:
        example: Mapping with at least an 'instruction' entry.

    Returns:
        The same `example` object, with the prompt stored under 'text'.
    """
    prompt = template.format(instruction=example['instruction'], output="")
    example['text'] = prompt
    return example


# Read the whole CSV as a single split; it is partitioned further below.
dataset = load_dataset("csv", data_files=dataset_path, split='train')
print("[Raw Dataset]", dataset, "-" * 40, sep="\n")

# Keep only the examples whose `input` column is missing or empty.
print("[Filtered Dataset]")
dataset = dataset.filter(
    lambda e: e['input'] in (None, ""),
    num_proc=2,                                             # two worker processes
)
print(dataset, "-" * 40, sep="\n")

# Hold out 10% of the filtered examples; the fixed seed keeps the split stable.
print("[Splitted Dataset]")
dataset = dataset.train_test_split(seed=0, test_size=0.1)
print(dataset, "-" * 40, sep="\n")

# Evaluate on the first 100 indices of a seeded random permutation of the
# test split, then attach the inference prompt to each selected example.
print("[Test Subset]")
test_subset_indices = torch.randperm(
    len(dataset['test']),
    generator=torch.Generator().manual_seed(0),             # fixed seed for reproducibility
)[:100]
dataset_test_subset = (
    dataset['test']
    .select(test_subset_indices)
    .map(formatting_test, num_proc=2)
)
print(dataset_test_subset, "-" * 40, sep="\n")

[Raw Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})
----------------------------------------
[Filtered Dataset]
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 31338
})
----------------------------------------
[Splitted Dataset]
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 28204
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3134
    })
})
----------------------------------------
[Test Subset]
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 100
})
----------------------------------------


## Load LLaMA2 Model

In [None]:
model_name = "meta-llama/Llama-2-7b-hf"

# Reset CUDA's peak-memory counter before loading; it is read back at the end of the cell.
torch.cuda.reset_peak_memory_stats()

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
# doc: https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, with every module mapped to device 0.
# doc: https://huggingface.co/docs/transformers/v4.38.2/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    cache_dir=cache_dir,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, trust_remote_code=True)
# The pretrained Llama 2 tokenizer has no padding token, so the EOS token is reused.
tokenizer.pad_token = tokenizer.eos_token
print(f"tokenizer.add_bos_token: {tokenizer.add_bos_token}")
print(f"tokenizer.add_eos_token: {tokenizer.add_eos_token}")

# Peak bytes allocated on `device` since the reset above, converted to gigabytes.
# doc: https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html#torch.cuda.max_memory_allocated
peak_memory = torch.cuda.max_memory_allocated(device) / 1024**3
print(f"Memory allocated: {peak_memory:.3}GB")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.add_bos_token: True
tokenizer.add_eos_token: False
Memory allocated: 3.73GB


## <font color='#EC7063'>TODO 3:</font> Define the Evaluation Function

In [None]:
def inference(model, tokenizer, instruction: str, max_new_tokens: int = 256) -> str:
    """Generate the model's response to a single instruction.

    The instruction is inserted into the notebook-level `template` with an
    empty response slot, so generation continues right after "### Response:".

    Args:
        model: Causal language model (optionally PEFT-wrapped) exposing
            `generate` and `device`.
        tokenizer: Tokenizer that matches `model`.
        instruction: Task description to place in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped), with surrounding whitespace stripped.
    """
    prompt = template.format(instruction=instruction, output="")
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding keeps the evaluation deterministic; no gradients are needed.
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # `generate` returns prompt + continuation; score only the continuation.
    completion_ids = generated[0, prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


def evaluating(model, tokenizer, dataset) -> Tuple[List[str], List[str], Dict[str, float]]:
    """Run the model over `dataset` and score its generations with ROUGE.

    Args:
        model: Model passed through to `inference`.
        tokenizer: Tokenizer passed through to `inference`.
        dataset: Iterable of examples with 'instruction' and 'output' entries;
            'output' is used as the single reference for each example.

    Returns:
        A tuple `(references, outputs, rouge_scores)`: the reference texts,
        the generated texts in the same order, and the dictionary returned
        by the ROUGE metric's `compute`.
    """
    model.eval()
    references = []
    outputs = []

    for example in tqdm(dataset):
        outputs.append(inference(model, tokenizer, example['instruction']))
        references.append(example['output'])

    rouge = evaluate.load('rouge')
    rouge_scores = rouge.compute(predictions=outputs, references=references)

    return references, outputs, rouge_scores

## Evaluate Raw LLaMA2

In [None]:
# Score the current `model` on the 100-example test subset.
references, outputs, rouge_scores = evaluating(model, tokenizer, dataset_test_subset)

# Report each ROUGE metric as a percentage with two decimals.
for metric_name in rouge_scores:
    value = rouge_scores[metric_name]
    print(f"{metric_name}: {value * 100:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

## Evaluate the Fine-tuned LLaMA2

In [None]:
# Wrap the already-loaded base model with the adapter weights saved under
# "checkpoint-last" in the Colab working directory, mapped to device 0.
model = PeftModel.from_pretrained(
    model,
    os.path.join(colab_dir, "checkpoint-last"),
    device_map={"": 0},
)
model.eval()

# Peak bytes allocated on `device` so far, converted to gigabytes.
peak_memory = torch.cuda.max_memory_allocated(device) / 1024**3
print(f"Memory allocated: {peak_memory:.3}GB")

Memory allocated: 4.21GB


In [None]:
# Score the adapter-wrapped `model` on the same 100-example test subset.
references, outputs, rouge_scores = evaluating(model, tokenizer, dataset_test_subset)

# Report each ROUGE metric as a percentage with two decimals.
for metric_name in rouge_scores:
    value = rouge_scores[metric_name]
    print(f"{metric_name}: {value * 100:.2f}")

NameError: name 'model' is not defined