<a href="https://colab.research.google.com/github/HAL22/llm-tutorial/blob/finetune-dolly/Finetune_Dolly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning Dolly GPT-J-6B with LoRA

LoRA paper: https://arxiv.org/abs/2106.09685

In [1]:
# Fetch the AlpacaDataCleaned repository; later cells read its JSON files
# from ./AlpacaDataCleaned/.
!git clone https://github.com/gururise/AlpacaDataCleaned.git

Cloning into 'AlpacaDataCleaned'...
remote: Enumerating objects: 747, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 747 (delta 64), reused 94 (delta 53), pack-reused 623[K
Receiving objects: 100% (747/747), 76.51 MiB | 21.01 MiB/s, done.
Resolving deltas: 100% (411/411), done.
Updating files: 100% (69/69), done.


In [2]:
# List the cloned repository to confirm the dataset JSON files are present.
ls AlpacaDataCleaned/

alpaca_data_cleaned_archive.json  [0m[01;34meval[0m/                    README.md
alpaca_data_cleaned.json          generate_instruction.py  requirements.txt
alpaca_data.json                  [01;34mgui[0m/                     schema.json
alpacaModifier.py                 LICENSE                  seed_tasks.jsonl
[01;34massets[0m/                           modifierGui.py           [01;34mtools[0m/
DATA_LICENSE                      prompt.txt               utils.py
[01;34mdataset_extensions[0m/               pyproject.toml


In [3]:
!pip install -q datasets loralib sentencepiece
!pip uninstall transformers
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.8 MB/s[0m eta

## Load Tokenizer and Dataset

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer for the GPT-J-6B checkpoint; used in this section to tokenize the
# rendered prompts.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")


# Left disabled: reusing EOS as the pad token. The fine-tuning section sets
# `tokenizer.pad_token_id` explicitly instead.
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id

# Load alpaca_data.json from the cloned repository.
# NOTE(review): the fine-tuning section reassigns `data` from
# alpaca_data_cleaned.json, so this copy only feeds the preview cells.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data.json")


def generate_prompt(data_point):
    """Render one Alpaca-style record as a single training prompt string.

    Args:
        data_point: Mapping with "instruction", "input" and "output" keys.

    Returns:
        The full prompt text, ending with the expected output. The
        "### Input:" section is included only when ``data_point["input"]``
        is non-empty; otherwise the shorter instruction-only template is used.
    """
    # taken from https://github.com/tloen/alpaca-lora
    # Branch on the optional "input" field. Branching on "instruction" (always
    # present) made the instruction-only template unreachable and rendered an
    # empty "### Input:" section for records without extra context.
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Response:
{data_point["output"]}"""




Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Peek at one training record to see its instruction / input / output fields.
data['train'][10]

{'output': 'Julius Caesar was assassinated by a group of up to 60 conspirators, led by Gaius Cassius Longinus and Marcus Junius Brutus, in the Senate House on the Ides of March (15 March) of 44 BC.',
 'input': '',
 'instruction': 'How did Julius Caesar die?'}

In [12]:
def _tokenize_prompt(data_point):
    """Return the tokenized rendered prompt for one record under the "prompt" key."""
    rendered = generate_prompt(data_point)
    return {"prompt": tokenizer(rendered)}


# Add a "prompt" column holding the tokenizer output for every record.
data = data.map(_tokenize_prompt)

data

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'prompt'],
        num_rows: 52002
    })
})

## Finetuning Dolly

In [13]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, GPTJForCausalLM

from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

In [14]:
# Training hyperparameters.
# NOTE(review): this header originally read "Settings for A100 - For 3090" and
# the micro-batch comment said "change to 4 for 3090", but the value is already
# 4 — confirm which GPU these settings were tuned for.
MICRO_BATCH_SIZE = 4  # per-device batch size passed to the Trainer
BATCH_SIZE = 128  # target batch size per optimizer step (micro batch x accumulation steps)
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE  # 128 // 4 = 32
EPOCHS = 1  # paper uses 3
LEARNING_RATE = 2e-5
CUTOFF_LEN = 256  # max_length used when truncating/padding the tokenized prompts
LORA_R = 4  # passed as `r` to LoraConfig
LORA_ALPHA = 16  # passed as `lora_alpha` to LoraConfig
LORA_DROPOUT = 0.05  # passed as `lora_dropout` to LoraConfig

In [None]:
# Checkpoint shared by the tokenizer and the model.
BASE_CHECKPOINT = "EleutherAI/gpt-j-6B"

# NOTE(review): `add_eos_token` is an option of LLaMA-style tokenizers; confirm
# that the GPT-J tokenizer actually appends EOS when it is passed here.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, add_eos_token=True)

# Load the base weights in 8-bit and let `device_map="auto"` place them.
model = GPTJForCausalLM.from_pretrained(
    BASE_CHECKPOINT,
    load_in_8bit=True,
    device_map="auto",
)

# Prepare the 8-bit model for training with gradient checkpointing enabled.
# NOTE(review): newer peft releases may expose this helper under a different
# name (prepare_model_for_kbit_training) — confirm against the installed version.
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

In [None]:
# LoRA adapter configuration built from the constants in the settings cell;
# adapters target the modules named "q_proj" and "v_proj".
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the prepared base model with the LoRA adapters.
model = get_peft_model(model, config)
# NOTE(review): the original comment called id 0 "unk", which looks inherited
# from a LLaMA vocabulary. For GPT-J's tokenizer id 0 is presumably an ordinary
# token — check tokenizer.convert_ids_to_tokens(0) and whether masking it as
# padding is acceptable.
tokenizer.pad_token_id = 0  # we want this to be different from the eos token

# Reassigns `data`: from here on it holds alpaca_data_cleaned.json, replacing
# the alpaca_data.json copy loaded earlier in the notebook.
data = load_dataset("json", data_files="./AlpacaDataCleaned/alpaca_data_cleaned.json")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:


# Fixed seed so the shuffled example order is the same after every
# Restart & Run All; the unseeded shuffle produced a different order per run.
SHUFFLE_SEED = 42

# Shuffle, then tokenize every rendered prompt to exactly CUTOFF_LEN tokens
# (longer prompts are truncated, shorter ones padded to max_length).
data = data.shuffle(seed=SHUFFLE_SEED).map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )
)

Map:   0%|          | 0/51759 [00:00<?, ? examples/s]

In [None]:
# Display the dataset to confirm the tokenizer columns (input_ids,
# attention_mask) were added by the map above.
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 51759
    })
})

In [None]:

# Optimisation settings, driven by the constants from the settings cell.
training_args = transformers.TrainingArguments(
    output_dir="lora-dolly",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    fp16=True,
    logging_steps=1,
    save_total_limit=3,
)

# Language-modeling collator with masked-LM mode switched off (mlm=False).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

# use_cache is switched off before training (presumably because it conflicts
# with gradient checkpointing — confirm).
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)

# Save the trained adapter locally.
model.save_pretrained("gptj6b-lora-dolly")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.3886
2,2.3041
3,2.3462
4,2.333
5,2.2798
6,2.3429
7,2.3024
8,2.4163
9,2.3456
10,2.3264




In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the following cell pushes the adapter
# to the Hub and relies on the stored token.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the trained adapter to the Hub repository "samwit/dolly-lora".
# NOTE(review): the repo id is hard-coded — replace it with a repository your
# own account can write to before running this cell.
model.push_to_hub("samwit/dolly-lora", use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/7.38M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/samwit/dolly-lora/commit/cf5418849e8711af5cefea64722a83f53a0d366f', commit_message='Upload model', commit_description='', oid='cf5418849e8711af5cefea64722a83f53a0d366f', pr_url=None, pr_revision=None, pr_num=None)