<a href="https://colab.research.google.com/github/LaurynasRekasius/Domain_Name_Generator/blob/main/notebooks/LLM_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTANT:
This notebook runs only on an A100 GPU, which requires a Colab Pro subscription.

# Setup Libraries & Data

In [1]:
!pip install transformers trl accelerate torch bitsandbytes peft -qU

from IPython.display import clear_output
import json
import requests

clear_output()

In [2]:
# Raw-file URLs of the train / test splits in the project repository
TRAIN_URL = 'https://raw.githubusercontent.com/LaurynasRekasius/Domain_Name_Generator/main/data/train_dict.json'
TEST_URL = 'https://raw.githubusercontent.com/LaurynasRekasius/Domain_Name_Generator/main/data/test_dict.json'


def _fetch_json(url, timeout=30):
  """
  Download `url` with a GET request and return the decoded JSON body.

  Raises requests.HTTPError on a non-2xx response, so a missing or moved file
  fails here instead of surfacing later as a confusing JSON decode error.
  """
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  return response.json()


train_chats = _fetch_json(TRAIN_URL)
print(f"File train_chats downloaded. It has {len(train_chats)} items")

test_chats = _fetch_json(TEST_URL)
# The original message said "train_chats" here although it reports the test split.
print(f"File test_chats downloaded. It has {len(test_chats)} items")


File train_chats downloaded. It has 3427 items
File train_chats downloaded. It has 1573 items


## Phi-2

### Format Prompt


In [3]:
def create_prompt(data_input):
  """
  Build the training text for one record in the "Instruct: ... Output: ..." layout.

  Args:
    data_input: mapping with a 'prompt' string (the description) and a
      'response' value that is serialised with json.dumps.

  Returns:
    A single string: the instruction, the description, and the expected
    answer rendered as {"names": <json>}.
  """
  description = data_input['prompt']
  names_json = json.dumps(data_input['response'])

  # Pieces are concatenated verbatim; any spacing lives inside the literals.
  pieces = (
      "Instruct:",
      " Provide 5 names in JSON based on description: ",
      description,
      " Output: ",
      '{"names": ',
      names_json,
      "}",
  )
  return "".join(pieces)

In [4]:
# Example: render the Phi-2 training prompt for a single test record
create_prompt(test_chats[2])

'Instruct: Provide 5 names in JSON based on description: PixelPioneers: A visionary technology company revolutionizing digital media with cutting-edge image and video processing technologies, offering tailored solutions for businesses and creatives. Output: {"names": ["imagecrafter", "pixelalchemist", "visualgenius", "mediamorphosis", "pictureperfecttech"]}'

### Loading the Base Model

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub id of the base checkpoint to fine-tune
model_name = "microsoft/phi-2"

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantised base model; the KV cache is switched off for training
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    use_cache=False,
    device_map='auto',
)

# Matching tokenizer: right-side padding, with EOS reused as the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Hide the download / loading progress output
clear_output()

### Setting up the Training


In [6]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantised base model
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings for a causal language model
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapter described above
model = get_peft_model(model, peft_config)

In [10]:
from transformers import TrainingArguments

# Training hyperparameters for the Phi-2 fine-tune
args = TrainingArguments(
  output_dir="phi_2_dn",
  num_train_epochs=5,
  per_device_train_batch_size=4,
  # 0.03 is a fraction of the total training steps, so it belongs in
  # warmup_ratio; warmup_steps expects an absolute number of steps.
  # NOTE(review): the 'constant' schedule below is documented as having no
  # warmup phase - use 'constant_with_warmup' if the warmup should take effect.
  warmup_ratio=0.03,
  logging_steps=10,
  # Checkpoint and evaluate once per epoch. The former eval_steps=20 was
  # dropped: it only applies to step-based evaluation and was ignored here.
  save_strategy="epoch",
  evaluation_strategy="epoch",
  learning_rate=2e-4,
  bf16=True,
  lr_scheduler_type='constant',
)

In [11]:
from trl import SFTTrainer

# Sequence length handed to the trainer
max_seq_length = 2048

trainer = SFTTrainer(
  model=model,
  args=args,
  tokenizer=tokenizer,
  peft_config=peft_config,
  train_dataset=train_chats,  # training records
  eval_dataset=test_chats,  # test records, used for evaluation
  formatting_func=create_prompt,  # applies the create_prompt mapping to every training and test record
  packing=True,
  max_seq_length=max_seq_length,
)

In [12]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.1024,1.145145
2,1.0193,1.058918
3,0.9603,1.014993
4,0.923,0.981921
5,0.8995,0.961854




TrainOutput(global_step=215, training_loss=1.040920809812324, metrics={'train_runtime': 897.5283, 'train_samples_per_second': 0.941, 'train_steps_per_second': 0.24, 'total_flos': 2.82636321816576e+16, 'train_loss': 1.040920809812324, 'epoch': 5.0})

### Save Model and Push to HF

In [13]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; presumably needed so the push_to_hub
# calls in the next cell can authenticate - confirm
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# Name of the local save directory and of the Hub repo for the merged model
new_model = "phi_2_dn_V2"

# Save the trainer's model to a local directory called new_model
trainer.save_model(new_model)

# Trainer.push_to_hub's first positional parameter is the commit message, not
# a repo id, so the original call used new_model as the commit message. The
# destination repo comes from the TrainingArguments (hub_model_id, or the
# output_dir name when that is unset); make the commit message explicit.
trainer.push_to_hub(commit_message=f"Upload fine-tuned adapter ({new_model})")

# Fold the LoRA weights into the base model and publish model + tokenizer.
# NOTE(review): the base model was loaded in 4-bit; merging into quantised
# weights may introduce rounding error - consider reloading the base model in
# half precision before merging. Confirm before relying on the merged weights.
merged_model = model.merge_and_unload()
merged_model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/294M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

events.out.tfevents.1710188286.4b42c46ae487.1688.0:   0%|          | 0.00/11.3k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Soaky/phi_2_dn_V2/commit/8d329c1fae26f97f2eb7e2fd3c9d0667feedc506', commit_message='Upload tokenizer', commit_description='', oid='8d329c1fae26f97f2eb7e2fd3c9d0667feedc506', pr_url=None, pr_revision=None, pr_num=None)

## Mistral-7b

### Format Prompt

In [17]:
def create_prompt(data_input):
  """
  Format one training example in the [INST] ... [/INST] instruction layout.

  Args:
    data_input: mapping with a 'prompt' string (the business description) and
      a 'response' value that is serialised with json.dumps.

  Returns:
    The text '<s>[INST] <instruction + description> Respond in JSON format.
    [/INST] {"names": <json>} </s>' as a single string.
  """
  system = "Your task is to generate a list of 5 names based on the business description: "
  prompt = data_input['prompt']
  # json.dumps already returns a str, so no extra str() conversion is needed
  response = json.dumps(data_input['response'])

  # NOTE(review): "<s>" / "</s>" are written as literal text here; the
  # tokenizer may also add its own BOS token - confirm this does not yield a
  # duplicated BOS in the tokenised training data.
  return (
      "<s>[INST] "
      + system
      + prompt
      + " Respond in JSON format. [/INST] "
      + '{"names": ' + response + "}"
      + " </s>"
  )

In [18]:
# Example: render the Mistral training prompt for a single test record
create_prompt(test_chats[1])

'<s>[INST] Your task is to generate a list of 5 names based on the business description: PixelPioneers: A visionary technology company, merging the worlds of art and technology to create immersive digital experiences. We specialize in virtual and augmented reality, 3D modeling, and animation, transforming brands and stories into captivating visual journeys. Respond in JSON format. [/INST] {"names": ["realityforge", "pixelfusion", "augmentart", "virtualvoyage", "immersiveideas"]} </s>'

### Loading the Base Model

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Hub id of the base checkpoint to fine-tune
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantisation with double quantisation; compute runs in bfloat16
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the quantised base model; the KV cache is switched off for training
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    use_cache=False,
    device_map='auto',
)

# Matching tokenizer: right-side padding, with EOS reused as the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Hide the download / loading progress output
clear_output()

### Setting up the Training

In [20]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantised base model
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings for a causal language model
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapter described above
model = get_peft_model(model, peft_config)

In [21]:
from transformers import TrainingArguments

# Training hyperparameters for the Mistral-7B fine-tune
args = TrainingArguments(
  output_dir="mistral_7b_dn",
  num_train_epochs=4,
  per_device_train_batch_size=4,
  # 0.03 is a fraction of the total training steps, so it belongs in
  # warmup_ratio; warmup_steps expects an absolute number of steps.
  # NOTE(review): the 'constant' schedule below is documented as having no
  # warmup phase - use 'constant_with_warmup' if the warmup should take effect.
  warmup_ratio=0.03,
  logging_steps=10,
  # Checkpoint and evaluate once per epoch. The former eval_steps=20 was
  # dropped: it only applies to step-based evaluation and was ignored here.
  save_strategy="epoch",
  evaluation_strategy="epoch",
  learning_rate=2e-4,
  bf16=True,
  lr_scheduler_type='constant',
)

In [22]:
from trl import SFTTrainer

# Sequence length handed to the trainer
max_seq_length = 2048

trainer = SFTTrainer(
  model=model,
  args=args,
  tokenizer=tokenizer,
  peft_config=peft_config,
  train_dataset=train_chats,  # training records
  eval_dataset=test_chats,  # test records, used for evaluation
  formatting_func=create_prompt,  # applies the create_prompt mapping to every training and test record
  packing=True,
  max_seq_length=max_seq_length,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [23]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6457,0.644759
2,0.5458,0.592459
3,0.5126,0.569824
4,0.4575,0.553187




TrainOutput(global_step=220, training_loss=0.6016638127240268, metrics={'train_runtime': 750.0311, 'train_samples_per_second': 1.173, 'train_steps_per_second': 0.293, 'total_flos': 7.718550286368768e+16, 'train_loss': 0.6016638127240268, 'epoch': 4.0})

### Save Model and Push to HF

In [None]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; presumably needed so the push_to_hub
# calls in the next cell can authenticate - confirm
notebook_login()

In [24]:
# Name of the local save directory and of the Hub repo for the merged model
new_model = "Mistral_dn_V2"

# Save the trainer's model to a local directory called new_model
trainer.save_model(new_model)

# Trainer.push_to_hub's first positional parameter is the commit message, not
# a repo id, so the original call used new_model as the commit message. The
# destination repo comes from the TrainingArguments (hub_model_id, or the
# output_dir name when that is unset); make the commit message explicit.
trainer.push_to_hub(commit_message=f"Upload fine-tuned adapter ({new_model})")

# Fold the LoRA weights into the base model and publish model + tokenizer.
# NOTE(review): the base model was loaded in 4-bit; merging into quantised
# weights may introduce rounding error - consider reloading the base model in
# half precision before merging. Confirm before relying on the merged weights.
merged_model = model.merge_and_unload()
merged_model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

events.out.tfevents.1710189527.4b42c46ae487.1688.1:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Soaky/Mistral_dn_V2/commit/81c2edcb65174c883efd18cc1a1369573bfc900d', commit_message='Upload tokenizer', commit_description='', oid='81c2edcb65174c883efd18cc1a1369573bfc900d', pr_url=None, pr_revision=None, pr_num=None)