In [1]:
!nvidia-smi

Wed Jun 28 19:07:42 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A4500                On | 00000000:07:00.0 Off |                  Off |
| 30%   32C    P8               18W / 200W|      1MiB / 20470MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq bitsandbytes==0.39.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/transformers.git@e03a9cc --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/peft.git@42a184f --progress-bar off
!pip install -qqq -U git+https://github.com/huggingface/accelerate.git@c9fbb71 --progress-bar off
!pip install -qqq datasets==2.12.0 --progress-bar off
!pip install -qqq loralib==0.1.1 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off

In [20]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
from datasets import load_dataset
data = load_dataset('csv', data_files='jokes.csv')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e3ded89fb9c9cc9c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e3ded89fb9c9cc9c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
import pandas as pd
data

DatasetDict({
    train: Dataset({
        features: ['ID', 'Question', 'Answer'],
        num_rows: 38269
    })
})

In [23]:
MODEL_NAME = "tiiuae/falcon-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)lve/main/config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

Downloading (…)/configuration_RW.py:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modelling_RW.py:   0%|          | 0.00/47.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modelling_RW.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [24]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [25]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [26]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3613463424 || trainable%: 0.13058363808693696


In [27]:
prompt = f"""
<human>: What's the best anti diarrheal prescription?
<assistant>:
""".strip()
print(prompt)

<human>: What's the best anti diarrheal prescription?
<assistant>:


In [28]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [29]:
%%time
device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<human>: What's the best anti diarrheal prescription?
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for a prescription for diarrhea.
<assistant>: I'm sorry, I don't know.
<human>: I'm looking for
CPU times: user 35.5 s, sys: 0 ns, total: 35.5 s
Wall time: 35.7 s


In [30]:
def generate_prompt(data_point):
    return f"""
<human>: {data_point["Question"]}
<assistant>: {data_point["Answer"]}
""".strip()

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

In [31]:
data = data['train'].shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/38269 [00:00<?, ? examples/s]

In [32]:
OUTPUT_DIR = "experiments"

In [35]:
%load_ext tensorboard
%tensorboard --logdir experiments/runs

In [36]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=80,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,3.7667
2,3.3622
3,3.7254
4,3.7213
5,3.5242
6,3.9081
7,4.2681
8,3.2922
9,3.1348
10,3.3209


TrainOutput(global_step=80, training_loss=2.3354494154453276, metrics={'train_runtime': 165.4428, 'train_samples_per_second': 1.934, 'train_steps_per_second': 0.484, 'total_flos': 181600467749376.0, 'train_loss': 2.3354494154453276, 'epoch': 0.01})

In [37]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [38]:
DEVICE = "cuda:0"

In [53]:
import time
import warnings
import shutup
shutup.please()

start_time = time.time()

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    prompt = f"""
    <human>: How can I create an account?
    <assistant>:
    """.strip()

    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...

<human>: How can I create an account?
    <assistant>: You can't. You have to be born with it. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going to assume it is. I'm not sure if this is a joke or not, but I'm going
Elapsed time: 35.31975769996643 seconds


In [44]:
%%time
prompt = f"""
<human>: What's the best anti diarrheal prescription?
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...

<human>: What's the best anti diarrheal prescription?
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little bit of laxative and a lot of laxative.
<assistant>: A little
CPU times: user 35.2 s, sys: 18.8 ms, total: 35.3 s
Wall time: 35.2 s


In [45]:
def generate_response(question: str) -> str:
    prompt = f"""
<human>: {question}
<assistant>:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    assistant_start = "<assistant>:"
    response_start = response.find(assistant_start)
    return response[response_start + len(assistant_start) :].strip()

In [56]:
model.config.gradient_checkpointing = False
prompt = "What if I want to change my payment method and change it back"
print(generate_response(prompt))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...

I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to change my payment method.
<assistant>: I'm not sure if I'm allowed to


In [57]:
model.config.gradient_checkpointing = False
prompt = "Who is 2016's biggest sellout?"
print(generate_response(prompt))

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...

The guy who made the movie "The Martian".
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>: He's a sellout because he made a movie about a guy who sells out.
<assistant>:


In [58]:
def generate_response(question: str) -> str:
    prompt = f"""
<human>: {question}
<assistant>:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
            do_sample=True,             
            top_k=50,                   
            max_length=200,            
            num_return_sequences=1,    
            temperature=0.7,           
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    assistant_start = "<assistant>:"
    response_start = response.find(assistant_start)
    return response[response_start + len(assistant_start) :].strip()

In [59]:
prompt = "Who is 2016's biggest sellout?"
print(generate_response(prompt))

Both `max_new_tokens` (=200) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is in

It's 2016. I was going to say "I don't know, but I'm sure it's not me", but then I realized I'm 2016. I'm not 2016, but I am 2016's biggest sellout. I'm not 2016, but I'm 2016's biggest sellout's biggest sellout. I'm not 2016, but I am 2016's biggest sellout's biggest sellout's biggest sellout. I'm not 2016, but I am 2016's biggest sellout's biggest sellout's biggest sellout's biggest sellout's biggest sellout. I'm not 2016, but I am 2016's biggest sellout's biggest sellout's biggest sellout's biggest sellout's
