In [6]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Restrict CUDA to device index 0 for this kernel.
# NOTE(review): this runs after `import torch`; presumably fine because nothing
# above has initialised CUDA yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [7]:
# Show the Hugging Face login widget; the push_to_hub call further down
# asks for the stored credential.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# Download the FAQ file from Google Drive by file id; it is saved as
# ecommerce-faq.json in the current working directory.
!gdown 1u85RQZdRTmpjGKcCc5anCMAHZ-um4DUC

Downloading...
From: https://drive.google.com/uc?id=1u85RQZdRTmpjGKcCc5anCMAHZ-um4DUC
To: /content/ecommerce-faq.json
  0% 0.00/21.0k [00:00<?, ?B/s]100% 21.0k/21.0k [00:00<00:00, 59.1MB/s]


In [9]:
# Parse the downloaded FAQ file into `data`. JSON is UTF-8 by specification,
# so the encoding is pinned instead of relying on the platform's locale default.
with open("ecommerce-faq.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [10]:
# Peek at the first FAQ record; sort_dicts=False keeps the keys in file order.
pprint(data["questions"][0], sort_dicts=False)

{'question': 'How can I create an account?',
 'answer': "To create an account, click on the 'Sign Up' button on the top "
           'right corner of our website and follow the instructions to '
           'complete the registration process.'}


In [12]:
# Hub id of the base checkpoint that gets fine-tuned.
MODEL_NAME = "tiiuae/falcon-7b"

# Quantization recipe: 4-bit NF4 weights with double quantization, using
# bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the checkpoint with the quantization recipe applied.
# trust_remote_code=True allows modelling code shipped with the Hub repo to run.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  30%|###       | 3.02G/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [17]:
# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token so the `padding=True`
# tokenizer calls further down have a pad token to work with.
# NOTE(review): presumably this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [18]:
# Turn on gradient checkpointing, then run PEFT's k-bit preparation helper on
# the quantized model and keep the returned model.
model.gradient_checkpointing_enable()
# NOTE(review): the deprecation notice appears twice in this cell's output,
# which suggests the helper below enables checkpointing again — confirm
# whether the explicit call above is still needed.
model = prepare_model_for_kbit_training(model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [19]:
# LoRA hyper-parameters: rank 16, alpha 32, 5% dropout, no bias terms, applied
# to the modules named "query_key_value" for a causal-LM task.
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Wrap the prepared model with the adapters described by `config`.
model = get_peft_model(model, config)


In [20]:
# Re-read the raw FAQ and keep only the list stored under "questions", so the
# file handed to `load_dataset` is a flat list of question/answer records.
# Paths are relative to the working directory (the same location the FAQ file
# is opened from earlier in the notebook) instead of being pinned to /content,
# and the encoding is explicit because JSON is UTF-8 by specification.
with open("ecommerce-faq.json", "r", encoding="utf-8") as f:
    data = json.load(f)
flattened_data = data["questions"]  # Extract the list of question-answer pairs

# Save the flattened data
with open("flattened_ecommerce_faq.json", "w", encoding="utf-8") as f:
    json.dump(flattened_data, f, indent=4)

# Load the dataset (the JSON loader puts everything in a single "train" split)
dataset = load_dataset("json", data_files="flattened_ecommerce_faq.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
# Define prompt generation functions
def generate_prompt(data_point):
    """Render one FAQ record as a two-line ``<human>`` / ``<assistant>`` prompt.

    Args:
        data_point: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        The human turn and the assistant turn joined by a newline, with
        leading and trailing whitespace removed.

    Raises:
        KeyError: If either entry is missing from ``data_point``.
    """
    human_turn = f"<human>: {data_point['question']}"
    assistant_turn = f"<assistant>: {data_point['answer']}"
    return "\n".join((human_turn, assistant_turn)).strip()

def generate_and_tokenize_prompt(data_point):
    """Build the prompt for one record and run it through the tokenizer.

    Args:
        data_point: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the rendered
        prompt when called with ``padding=True`` and ``truncation=True``.
    """
    return tokenizer(generate_prompt(data_point), truncation=True, padding=True)


Map:   0%|          | 0/79 [00:00<?, ? examples/s]

In [22]:
# Shuffle with a fixed seed so the example order (and with it the training
# run) is the same after Restart & Run All, then tokenize every record.
SHUFFLE_SEED = 42
data = dataset["train"].shuffle(seed=SHUFFLE_SEED).map(generate_and_tokenize_prompt)
data

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 79
})

In [47]:
# Directory that receives checkpoints and TensorBoard logs.
OUTPUT_DIR = "experiments"

# Run length is set by `max_steps` alone. The previous `num_train_epochs=1`
# was ignored (the trainer logs that max_steps overrides it; the recorded run
# ended at epoch ~4.05), so it is dropped to avoid a misleading setting.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 = 4 examples per optimizer step
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=80,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

In [48]:
# Switch the generation cache off for the training run; it is switched back on
# before inference further down.
model.config.use_cache = False

# mlm=False selects causal-LM collation rather than masked-LM collation.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=causal_lm_collator,
)

# Left as the last expression so the TrainOutput summary is displayed.
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)


Step,Training Loss
1,0.2167
2,0.2426
3,0.2192
4,0.2316
5,0.3236
6,0.2286
7,0.241
8,0.3524
9,0.3431
10,0.3094


TrainOutput(global_step=80, training_loss=0.20134220626205207, metrics={'train_runtime': 458.7675, 'train_samples_per_second': 0.698, 'train_steps_per_second': 0.174, 'total_flos': 674369673408000.0, 'train_loss': 0.20134220626205207, 'epoch': 4.050632911392405})

In [49]:
# Save the fine-tuned (PEFT-wrapped) model into the local "trained-model" directory.
model.save_pretrained("trained-model")

In [50]:
# Upload the fine-tuned model to the Hub repository below.
# `token=True` asks for the credential stored at login; it replaces the
# deprecated `use_auth_token=True` spelling of the same option.
model.push_to_hub(
    "ItzShahzaib/falcon-7b-qlora-chat-support-bot-faq", token=True
)



adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ItzShahzaib/falcon-7b-qlora-chat-support-bot-faq/commit/e93a40e1c3f1d9c505f51abe04e68fc8030a2a80', commit_message='Upload model', commit_description='', oid='e93a40e1c3f1d9c505f51abe04e68fc8030a2a80', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ItzShahzaib/falcon-7b-qlora-chat-support-bot-faq', endpoint='https://huggingface.co', repo_type='model', repo_id='ItzShahzaib/falcon-7b-qlora-chat-support-bot-faq'), pr_revision=None, pr_num=None)

In [53]:
# Decoding settings used by `generate_response`.
# Note: this is the model's own generation_config object, not a copy, so the
# assignments below also change the model's defaults.
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature / top_p only take effect when sampling is enabled; without
# do_sample=True generation is greedy and both values are ignored. Enabling it
# matches the stand-alone `model.generate(..., do_sample=True)` call.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# The tokenizer pads with its EOS token, so generation is told the same.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [54]:
# Undo the training-time settings before generating: turn the cache back on
# (it was set to False ahead of training) and switch gradient checkpointing off.
model.config.use_cache = True
model.gradient_checkpointing_disable()


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [57]:
# Device that receives the tokenized inputs.
DEVICE = "cuda:0"

# Same two-line layout as the training prompts, with the assistant turn left
# open for the model to complete.
prompt = "\n".join(("<human>: How can I create an account?", "<assistant>:"))

encoding = tokenizer(
    prompt,
    truncation=True,
    padding=True,
    return_tensors="pt",
).to(DEVICE)


In [58]:
# Sampling settings for this one-off generation call.
sampling_options = {
    "do_sample": True,  # Enable sampling
    "temperature": 0.7,
    "top_p": 0.7,
    "max_new_tokens": 200,
    "num_return_sequences": 1,
}

# Generate a continuation for the prompt encoded in the previous cell.
outputs = model.generate(
    input_ids=encoding.input_ids,
    attention_mask=encoding.attention_mask,
    **sampling_options,
)

In [59]:
def generate_response(question: str) -> str:
    """Answer ``question`` with the fine-tuned model and return only the reply.

    The question is placed in the ``<human>`` / ``<assistant>`` layout used for
    training, tokenized, and passed to ``model.generate`` together with the
    module-level ``generation_config``.

    Args:
        question: The user's question as plain text.

    Returns:
        The decoded text of the newly generated tokens, stripped of
        surrounding whitespace.
    """
    prompt = f"""
<human>: {question}
<assistant>:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length rather than searching the decoded text for "<assistant>:", which
    # picks the wrong position when the question itself contains that marker
    # and silently mis-slices when the marker is not found.
    prompt_length = encoding.input_ids.shape[-1]
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

In [64]:
# Ask a sample question and print the reply.
# Note: this rebinds the notebook-level `prompt` variable to the bare question.
prompt = "How do I know when I'll receive my order?"
print(generate_response(prompt))

Once you have completed your order, please refer to the order confirmation email for an update on your shipment status and estimated delivery date. If you do not receive this email, please contact our customer support team for assistance.
