<a href="https://colab.research.google.com/github/LaurynasRekasius/Domain_Name_Generator/blob/main/notebooks/LLM_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTANT:
This notebook runs only on an A100 GPU, which requires a Colab Pro subscription.

# Setup Libraries & Data

In [1]:
!pip install transformers trl accelerate torch bitsandbytes peft -qU

from IPython.display import clear_output
import json
import requests

clear_output()

In [8]:
# Raw-content URLs of the train / test splits.
TRAIN_URL = 'https://raw.githubusercontent.com/LaurynasRekasius/Domain_Name_Generator/main/data/train_dict.json'
TEST_URL = 'https://raw.githubusercontent.com/LaurynasRekasius/Domain_Name_Generator/main/data/test_dict.json'


def download_json(url, timeout=30):
  """
  Fetch `url` with a GET request and return the body parsed as JSON.

  Raises `requests.HTTPError` on a 4xx/5xx response, so an error page is
  reported as a download failure instead of a confusing JSON parse error.
  `timeout` (seconds) keeps the cell from hanging on a stalled connection.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  return response.json()


train_chats = download_json(TRAIN_URL)
print(f"File train_chats downloaded. It has {len(train_chats)} items")

test_chats = download_json(TEST_URL)
# Fixed: this message previously reported the test split as "train_chats".
print(f"File test_chats downloaded. It has {len(test_chats)} items")


File train_chats downloaded. It has 3427 items
File train_chats downloaded. It has 1573 items


## Phi-2

### Format Prompt


In [3]:
def create_prompt(data_input):
  """
  Build the Phi-2 "Instruct: ... Output: ..." training string for one record.

  `data_input['prompt']` is inserted after the fixed instruction text, and
  `data_input['response']` is JSON-encoded and wrapped as the value of a
  `{"names": ...}` object that follows " Output: ".
  """
  instruction = " Provide 5 names in JSON based on description: "
  description = data_input['prompt']
  names_json = json.dumps(data_input['response'])

  # Concatenate the fixed scaffolding and the record-specific parts in order.
  pieces = [
    "Instruct:",
    instruction,
    description,
    " Output: ",
    '{"names": ',
    names_json,
    "}",
  ]
  return "".join(pieces)

In [4]:
# Example: render the full Phi-2 training prompt for one record of the test split.
create_prompt(test_chats[2])

'Instruct: Provide 5 names in JSON based on description: PixelPioneers: A visionary technology company revolutionizing digital media with cutting-edge image and video processing technologies, offering tailored solutions for businesses and creatives. Output: {"names": ["imagecrafter", "pixelalchemist", "visualgenius", "mediamorphosis", "pictureperfecttech"]}'

### Loading the Base Model

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "microsoft/phi-2"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Quantized base model with automatic device placement and `use_cache` off.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map='auto',
    use_cache=False,
)

# Hide the download / loading progress output.
clear_output()

### Setting up the Training


In [6]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters described by `peft_config`.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [7]:
from transformers import TrainingArguments

args = TrainingArguments(
  output_dir="phi_2_dn",
  num_train_epochs=4,
  per_device_train_batch_size=4,
  # 0.03 is a fraction of the run, so it belongs in `warmup_ratio`;
  # `warmup_steps` is an absolute number of optimizer steps.
  # NOTE(review): the 'constant' scheduler below applies no warmup at all;
  # use 'constant_with_warmup' if warmup is actually wanted.
  warmup_ratio=0.03,
  logging_steps=10,
  save_strategy="epoch",
  # Evaluate once per epoch. `eval_steps` was dropped: it only takes effect
  # when the evaluation strategy is "steps".
  evaluation_strategy="epoch",
  learning_rate=2e-4,
  # bf16 needs an Ampere-class GPU (e.g. A100); on other hardware this
  # constructor raises ValueError.
  bf16=True,
  lr_scheduler_type='constant',
)

ValueError: Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0

In [None]:
from trl import SFTTrainer

max_seq_length = 2048

# Supervised fine-tuning trainer: raw records go in, `create_prompt` turns
# each one into its training string.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=train_chats,      # training records
    eval_dataset=test_chats,        # records used for evaluation
    formatting_func=create_prompt,  # applied to the training and evaluation records
    packing=True,
    max_seq_length=max_seq_length,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Launch fine-tuning with the `trainer` built in the previous cell; the
# returned training summary is displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.1689,1.256043




Epoch,Training Loss,Validation Loss
1,1.1689,1.256043
2,1.0778,1.168343
3,1.012,1.123814
4,0.9664,1.091755




TrainOutput(global_step=164, training_loss=1.1393094324484103, metrics={'train_runtime': 658.509, 'train_samples_per_second': 0.978, 'train_steps_per_second': 0.249, 'total_flos': 2.154056701181952e+16, 'train_loss': 1.1393094324484103, 'epoch': 4.0})

### Save Model and Push to HF

In [None]:
# Log in to the Hugging Face Hub through the interactive notebook widget;
# the uploads in the next cell need this authentication.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Local save directory and Hub repository name for the fine-tuned model.
new_model = "phi_2_dn"

# Save the trained model locally, then upload the Trainer's artifacts.
trainer.save_model(new_model)
# Fixed: the first positional argument of `Trainer.push_to_hub` is the commit
# message, not a repository name (the target repo comes from the training
# arguments). Pass a real commit message explicitly.
trainer.push_to_hub(commit_message="Upload fine-tuned LoRA adapter")

# Fold the LoRA weights into the base model and upload model + tokenizer.
# NOTE(review): the base model was loaded 4-bit quantized; merging adapters
# into quantized layers may introduce rounding differences. Confirm that is
# acceptable, or merge into a full-precision copy of the base model instead.
merged_model = model.merge_and_unload()
merged_model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

events.out.tfevents.1710132869.34d552ead135.685.0:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/294M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Soaky/phi_2_dn/commit/93adba7427b3a751f9cdde182ac586661eeb3da8', commit_message='phi_2_dn', commit_description='', oid='93adba7427b3a751f9cdde182ac586661eeb3da8', pr_url=None, pr_revision=None, pr_num=None)

## Mistral-7b

### Format Prompt

In [None]:
def create_prompt(data_input):
  """
  Build the Mistral-Instruct training string for one record.

  The result has the shape
  `<s>[INST] <task> <description> Respond in JSON format. [/INST] {"names": <response>} </s>`
  where the description is `data_input['prompt']` and the response is
  `data_input['response']` encoded with `json.dumps`.

  Raises `KeyError` if `data_input` lacks the 'prompt' or 'response' key.
  """
  system = "Your task is to generate a list of 5 names based on the business description: "
  prompt = data_input['prompt']
  # Fixed: this line referenced the misspelled name `data_inpu`, which raised
  # NameError on every call.
  response = json.dumps(data_input['response'])

  return (
    "<s>[INST] "
    + system
    + prompt
    + " Respond in JSON format. [/INST] "
    + '{"names": ' + response + "}"
    + " </s>"
  )

In [None]:
# Example: render the full Mistral training prompt for one record of the test split.
create_prompt(test_chats[1])

'<s>[INST] Your task is to generate a list of 5 names based on the business description: An innovative forestry company offering tree planting services, reforestation projects, and sustainable wood products. Respond in JSON format. [/INST] {"names": ["forestfuel", "treeteam", "ecogrowth", "woodwise", "greengrowth"]} </s>'

### Loading the Base Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Quantized base model with automatic device placement and `use_cache` off.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map='auto',
    use_cache=False,
)

# Hide the download / loading progress output.
clear_output()

### Setting up the Training

In [None]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantized base model for k-bit training, then wrap it with the
# LoRA adapters described by `peft_config`.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
  output_dir="mistral_7b_dn",
  num_train_epochs=4,
  per_device_train_batch_size=4,
  # 0.03 is a fraction of the run, so it belongs in `warmup_ratio`;
  # `warmup_steps` is an absolute number of optimizer steps.
  # NOTE(review): the 'constant' scheduler below applies no warmup at all;
  # use 'constant_with_warmup' if warmup is actually wanted.
  warmup_ratio=0.03,
  logging_steps=10,
  save_strategy="epoch",
  # Evaluate once per epoch. `eval_steps` was dropped: it only takes effect
  # when the evaluation strategy is "steps".
  evaluation_strategy="epoch",
  learning_rate=2e-4,
  # bf16 needs an Ampere-class GPU (e.g. A100).
  bf16=True,
  lr_scheduler_type='constant',
)

In [None]:
from trl import SFTTrainer

max_seq_length = 2048

# Supervised fine-tuning trainer: raw records go in, `create_prompt` turns
# each one into its training string.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=train_chats,      # training records
    eval_dataset=test_chats,        # records used for evaluation
    formatting_func=create_prompt,  # applied to the training and evaluation records
    packing=True,
    max_seq_length=max_seq_length,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Launch fine-tuning with the `trainer` built in the previous cell; the
# returned training summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5613,0.661826
2,0.4963,0.636242
3,0.4466,0.638582
4,0.3959,0.635852




TrainOutput(global_step=216, training_loss=0.4871018915264695, metrics={'train_runtime': 710.3355, 'train_samples_per_second': 1.216, 'train_steps_per_second': 0.304, 'total_flos': 7.57821300843479e+16, 'train_loss': 0.4871018915264695, 'epoch': 4.0})

### Save Model and Push to HF

In [None]:
# Log in to the Hugging Face Hub through the interactive notebook widget;
# the uploads in the next cell need this authentication.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Local save directory and Hub repository name for the merged model.
new_model = "Mistral_dn_fix"

# Save the trained model locally, then upload the Trainer's artifacts.
trainer.save_model(new_model)
# Fixed: the first positional argument of `Trainer.push_to_hub` is the commit
# message, not a repository name. Pass a real commit message explicitly.
# NOTE(review): the Trainer uploads to the repo derived from its training
# arguments (output_dir is "mistral_7b_dn"), which differs from `new_model`.
# Confirm the adapter is meant to live in a separate repo from the merged model.
trainer.push_to_hub(commit_message="Upload fine-tuned LoRA adapter")

# Fold the LoRA weights into the base model and upload model + tokenizer.
# NOTE(review): the base model was loaded 4-bit quantized; merging adapters
# into quantized layers may introduce rounding differences. Confirm that is
# acceptable, or merge into a full-precision copy of the base model instead.
merged_model = model.merge_and_unload()
merged_model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

events.out.tfevents.1710135296.34d552ead135.685.1:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1710135370.34d552ead135.685.2:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Soaky/Mistral_dn_fix/commit/4ae80fd6425f4d353fed1be1c8291b14edce8cef', commit_message='Upload tokenizer', commit_description='', oid='4ae80fd6425f4d353fed1be1c8291b14edce8cef', pr_url=None, pr_revision=None, pr_num=None)