In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chat-data/chat_data.json
/kaggle/input/last-data/chat_data.json


In [2]:
import pandas as pd
import json
# Read the chat file; the records used below sit under its top-level 'data' key.
with open('/kaggle/input/last-data/chat_data.json', 'r') as fh:
    data = json.load(fh)['data']

# Pair every pattern of an item with every response of that same item.
train_data = [
    {"prompt": pattern, "response": response}
    for item in data
    for pattern in item['patterns']
    for response in item['responses']
]

# Drop exact duplicate pairs, then preview the first rows.
df = pd.DataFrame(train_data).drop_duplicates()
df.head()


Unnamed: 0,prompt,response
0,Hi,Hello
1,Hi,Hi
2,Hi,Hi there
3,Hi,Hey
4,Hey,Hello


In [3]:
# Reproducible 90/10 split: sample the training rows with a fixed seed and
# keep every row whose index label was not sampled as the hold-out set.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.loc[~df.index.isin(train_df.index)]

# Write both splits as JSON Lines (one record per line), relative to the
# current working directory.
for frame, out_path in ((train_df, 'train.json'), (test_df, 'test.json')):
    frame.to_json(out_path, orient='records', lines=True)

In [4]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.1 trl==0.4.7
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2024-07-06 06:06:44.996184: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-06 06:06:44.996328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 06:06:45.125316: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
# --- Model and data locations -------------------------------------------
model_name = "NousResearch/llama-2-7b-chat-hf"
dataset_name = "/kaggle/working/train.json"
new_model = "SmartTour_llama-2-7b"
output_dir = "./results"
device_map = {"": 0}

# --- LoRA adapter --------------------------------------------------------
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# --- bitsandbytes 4-bit quantisation -------------------------------------
use_4bit = True
use_nested_quant = False
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"  # resolved to a torch dtype by name later

# --- Precision and batching ----------------------------------------------
fp16 = bf16 = False
per_device_train_batch_size = per_device_eval_batch_size = 2
gradient_accumulation_steps = 2
gradient_checkpointing = True

# --- Optimisation --------------------------------------------------------
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
lr_scheduler_type = "constant"
warmup_ratio = 0.03

# --- Schedule length, checkpointing and logging --------------------------
num_train_epochs = 2
max_steps = -1
save_steps = 25
logging_steps = 5

# --- Sequence handling ---------------------------------------------------
group_by_length = True
max_seq_length = None
packing = False




In [11]:
# Load datasets
# Both files are read with split="train", including the held-out file.
train_dataset = load_dataset('json', data_files='/kaggle/working/train.json', split="train")
# Fixed: the path previously started with a doubled slash ('//kaggle/...').
valid_dataset = load_dataset('json', data_files='/kaggle/working/test.json', split="train")

# Preprocess datasets
system_message = "Your system message goes here"


def format_chat_batch(examples):
    """Build the training text for one batch of prompt/response pairs.

    Args:
        examples: batched mapping with parallel 'prompt' and 'response' lists
            (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict with a single 'text' list: for each pair, the system message
        wrapped in `<<SYS>>` tags inside an `[INST]` block, followed by the
        prompt, ' [/INST] ' and the response.
    """
    system_block = f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n'
    return {
        'text': [
            system_block + prompt + ' [/INST] ' + response
            for prompt, response in zip(examples['prompt'], examples['response'])
        ]
    }


# One shared formatter keeps the train and validation templates identical.
train_dataset_mapped = train_dataset.map(format_chat_batch, batched=True)
valid_dataset_mapped = valid_dataset.map(format_chat_batch, batched=True)


In [12]:
# LoRA adapter settings, taken from the config cell.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

# Quantisation settings; the compute dtype is looked up on torch by its name.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with that quantisation config onto the configured device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Tokenizer: the EOS token doubles as the padding token, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Set training parameters
# Every value comes from the config cell except the reporting and evaluation
# settings, which are fixed here.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Evaluation runs during training, so pass the configured eval batch size
    # explicitly; it was declared in the config cell but never used before.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5  # Evaluate every 5 steps
)


In [14]:
# Supervised fine-tuning: collect the trainer arguments, build the trainer,
# run training, then save the trained model under `new_model`.
sft_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,  # used for the periodic evaluation
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)
trainer = SFTTrainer(**sft_kwargs)
trainer.train()

# Presumably this writes only the LoRA adapter, since a peft_config was
# supplied (verify against the saved files).
trainer.model.save_pretrained(new_model)




Map:   0%|          | 0/17 [00:00<?, ? examples/s]

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5,2.6418,2.447397
10,3.4363,2.014841
15,2.0252,1.749565
20,1.6388,1.586969
25,1.6586,1.468375
30,1.2262,1.407284
35,1.3971,1.314094
40,0.9604,1.246057
45,1.2503,1.215665
50,0.8632,1.177102




In [15]:
# Silence everything below CRITICAL from transformers while generating.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline over the in-memory model, capped at 200 tokens in
# total (prompt plus continuation).
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# The prompt follows the same [INST]/<<SYS>> layout used for the training text.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\nWhat languages are commonly spoken in Egypt? [/INST]"
result = pipe(prompt)
first_candidate = result[0]
print(first_candidate['generated_text'])



[INST] <<SYS>>
Your system message goes here
<</SYS>>

What languages are commonly spoken in Egypt? [/INST] Arabic is the official language, but English is widely spoken, especially in tourist areas. Other languages spoken include Egyptian Arabic, Greek, French, German, Italian, and Spanish. Knowing some basic Arabic phrases can be helpful, especially when communicating with locals. English is increasingly used in tourism and business. Understanding some basic Arabic phrases can be helpful, especially when communicating with locals.


In [16]:
from transformers import pipeline

# Same [INST]/<<SYS>> layout as the training text. Fixed: a space now precedes
# the closing [/INST] tag, matching the training template.
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\nwhat do you know about egypt [/INST]" # replace the command here with something relevant to your task
num_new_tokens = 100  # change to the number of new tokens you want to generate

# `max_new_tokens` bounds only the continuation, so the prompt no longer has
# to be tokenised by hand to compute a total length. `return_full_text=False`
# makes the pipeline return just the continuation, replacing the earlier
# str.replace(prompt, '') on the output.
gen = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=num_new_tokens,
    return_full_text=False,
)
result = gen(prompt)
print(result[0]['generated_text'])

 Egypt, officially known as the Arab Republic of Egypt, is a country located in northeastern Africa. It is bordered by the Mediterranean Sea to the north, the Red Sea to the east, Sudan to the south, and Libya to the west. Egypt is home to over 100 million people, making it the most populous country in Africa and the Arab world. The capital and largest city is Cairo, which is home to over 20% of


In [None]:
# Directory to save the fine-tuned model and tokenizer
fine_tuned_model_dir = "SmartTour_llama-2-7b"

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Save the tokenizer
tokenizer.save_pretrained(fine_tuned_model_dir)

# Fixed: this cell used to reload the untouched base checkpoint and save it
# under the fine-tuned name, so the trained LoRA weights never reached the
# exported model. It also rebound `model` to that base model.
#
# Reload the base weights in half precision, attach the adapter saved under
# `new_model` by the training cell, and fold the adapter into the base weights.
# NOTE(review): this needs room for a second, fp16 copy of the model on the
# device in `device_map`; free the quantised training model first (or restart
# the kernel and re-run the config cell) if memory is tight.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Save the merged model itself. save_pretrained also writes the model config,
# so the separate AutoConfig save is no longer needed. If `new_model` is the
# same directory, the adapter files already there sit alongside the merged
# weights.
merged_model.save_pretrained(fine_tuned_model_dir)
