In [None]:
from unsloth import FastLanguageModel
import torch
import pandas as pd

# Loader settings; later cells (trainer, optional reload) reuse these three names.
max_seq_length = 2048  # sequence-length limit handed to both the loader and the trainer
dtype = None           # None = auto detection (Float16 for Tesla T4/V100, Bfloat16 for Ampere+)
load_in_4bit = True    # 4bit quantization to reduce memory usage; can be False

# Gather the loader arguments in one place, then fetch the base model and its tokenizer.
loader_settings = dict(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_settings)

# Ref: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(1B_and_3B)-Conversational.ipynb#scrollTo=kR3gIAX-SM2q

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.381 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, collected so they can be read (and tuned) at a glance.
lora_settings = dict(
    r=16,                                  # adapter rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized path
    bias="none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank stabilized LoRA is available but off
    loftq_config=None,                     # LoftQ is available but off
)

# Wrap the base model with the adapters (rebinds `model` to the wrapped model).
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Read the Q&A CSV and map the capitalised column names to the lowercase
# 'question' / 'answer' keys that the following cells index by.
my_dataset = pd.read_csv(
    '../dataset/cleaned_dataset_answer_improved_reasoned.csv'
).rename(columns={'Question': 'question', 'Answer': 'answer'})
my_dataset.head()

Unnamed: 0,question,answer,reasoning
0,Is a high school diploma required for an F-1 v...,A high school diploma or its equivalent is gen...,Question Understanding\nThe question asks whet...
1,Is it important to memorize my SEVIS ID?,"It's crucial to know your SEVIS ID, as it's yo...",Question Understanding\nThe question asks abou...
2,Is proof of housing required at the port of en...,While proof of housing is not always required ...,Question Understanding\nThe question asks whet...
3,What document does a school provide for an F-1...,"A school provides Form I-20, a Certificate of ...",Question Understanding\nThe question asks abou...
4,What if I plan to do research collaboration wi...,If asked about potential research collaboratio...,Question Understanding\nThe question concerns ...


In [None]:
from datasets import Dataset
import pandas as pd

# One record per CSV row: a two-turn conversation in from/value form
# (question as the 'human' turn, answer as the 'gpt' turn).
formatted_data = [
    {
        'conversations': [
            {'from': 'human', 'value': record['question']},
            {'from': 'gpt', 'value': record['answer']},
        ]
    }
    for _, record in my_dataset.iterrows()
]

# Wrap the records in a Hugging Face dataset.
dataset = Dataset.from_list(formatted_data)

# Sanity check: show the first conversation.
print(dataset[0]['conversations'])

[{'from': 'human', 'value': 'Is a high school diploma required for an F-1 visa?'}, {'from': 'gpt', 'value': 'A high school diploma or its equivalent is generally required for an F-1 visa if you intend to pursue academic studies at a college, university, or other post-secondary institution. However, if you plan to enroll in a vocational or non-academic program, a high school diploma might not be mandatory, but meeting the educational requirements of that specific program is still essential. Always verify the specific admission requirements of the educational institution you plan to attend to ensure compliance with F-1 visa eligibility criteria.'}]


In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the result of get_chat_template with the "llama-3.1"
# template; formatting_prompts_func below renders conversations through it.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: a batch mapping whose "conversations" entry is a list of
            conversations (each one a list of message records).

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation, in the same order. Rendering uses the module-level
        `tokenizer`, without tokenizing and without a generation prompt.
    """
    rendered = []
    for convo in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        )
    return {"text": rendered}


In [None]:
from unsloth.chat_templates import standardize_sharegpt
# Standardize the from/value records, then add the rendered "text" column
# by mapping formatting_prompts_func over the dataset in batches.
dataset = standardize_sharegpt(dataset).map(formatting_prompts_func, batched=True)

Standardizing format:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Spot check: show the "conversations" entry of example 5 after standardization.
dataset[5]["conversations"]

[{'content': 'Is the visa interview the same for master’s and bachelor’s applicants?',
  'role': 'user'},
 {'content': "While the general visa interview process is similar for both master's and bachelor's applicants, the content and focus can differ. Graduate applicants should expect more in-depth questions regarding their chosen field of study, research experience (if applicable), and long-term career objectives. Visa officers may also probe into how the graduate program aligns with the applicant's previous education and future aspirations.",
  'role': 'assistant'}]

In [None]:
# Spot check: show the rendered "text" string of the same example.
dataset[5]["text"]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nIs the visa interview the same for master’s and bachelor’s applicants?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhile the general visa interview process is similar for both master's and bachelor's applicants, the content and focus can differ. Graduate applicants should expect more in-depth questions regarding their chosen field of study, research experience (if applicable), and long-term career objectives. Visa officers may also probe into how the graduate program aligns with the applicant's previous education and future aspirations.<|eot_id|>"

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation schedule. 10 per device x 4 accumulation steps gives 40 examples per update.
training_args = TrainingArguments(
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=100,  # 100 passes over the data; a max_steps cap (e.g. 60) is the shorter alternative
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,       # log the loss at every step
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",      # set to a tracker name (WandB etc.) to enable reporting
)

# Supervised fine-tuning over the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,         # packing can make training faster for short sequences; left off here
    args=training_args,
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open a user turn and an assistant turn in the rendered text.
user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Rebind the trainer to the version returned by train_on_responses_only.
trainer = train_on_responses_only(
    trainer,
    instruction_part=user_header,
    response_part=assistant_header,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Decode the token ids of training example 5 back to text to inspect what the trainer will see.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nIs the visa interview the same for master’s and bachelor’s applicants?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhile the general visa interview process is similar for both master's and bachelor's applicants, the content and focus can differ. Graduate applicants should expect more in-depth questions regarding their chosen field of study, research experience (if applicable), and long-term career objectives. Visa officers may also probe into how the graduate program aligns with the applicant's previous education and future aspirations.<|eot_id|>"

In [None]:
# Token id of a single space, used as a visible stand-in for label positions equal to -100.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]

# Decode the labels of example 5 with every -100 swapped for the space token.
example_labels = trainer.train_dataset[5]["labels"]
tokenizer.decode([tok if tok != -100 else space for tok in example_labels])

"                                                 \n\nWhile the general visa interview process is similar for both master's and bachelor's applicants, the content and focus can differ. Graduate applicants should expect more in-depth questions regarding their chosen field of study, research experience (if applicable), and long-term career objectives. Visa officers may also probe into how the graduate program aligns with the applicant's previous education and future aspirations.<|eot_id|>"

In [None]:
# Run fine-tuning; the object returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 100
O^O/ \_/ \    Batch size per device = 10 | Gradient Accumulation steps = 4
\        /    Total batch size = 40 | Total steps = 2,500
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.6122
2,1.6872
3,1.7073
4,1.5406
5,1.4875
6,1.4422
7,1.3766
8,1.332
9,1.3201
10,1.2972


In [None]:
# NOTE(review): this is the same CSV path that was loaded into `my_dataset` for
# training, so rows drawn from it are not held-out examples — confirm this is intended.
test_dataset = pd.read_csv(
    '../dataset/cleaned_dataset_answer_improved_reasoned.csv'
).rename(columns={'Question': 'question', 'Answer': 'answer'})

In [None]:
# Pick the question at row position 500 as the prompt to try, and display it.
sample = test_dataset.iloc[500]['question']
sample

'Are travel agencies reliable sources for visa counseling?'

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the "llama-3.1" chat template and switch the model to inference mode.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Prompt with the `sample` question picked from test_dataset instead of a duplicated
# literal, so changing the row index also changes what is generated.
messages = [
    {"role": "user", "content": sample},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# The prompt is a single unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids the "attention mask is not set" warning that
# generate() emits when the pad token equals the eos token.
attention_mask = torch.ones_like(inputs)

outputs = model.generate(input_ids = inputs, attention_mask = attention_mask,
                         max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAre travel agencies reliable sources for visa counseling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhile travel agencies can assist with travel arrangements, they are not reliable sources for visa counseling. For accurate and up-to-date information on visa regulations, requirements, and application procedures, always consult official government websites and the relevant embassy or consulate. Relying solely on travel agencies may lead to misinformation and jeopardize your visa application']

In [None]:
# Reference answer stored for row position 500, for comparison with the generated text.
test_dataset.iloc[500]['answer']

'While travel agencies can assist with travel arrangements, they are not reliable sources for visa counseling. For accurate and up-to-date information on visa regulations, requirements, and application procedures, always consult official government websites and the relevant embassy or consulate. Relying solely on travel agencies may lead to misinformation and jeopardize your visa application.'

In [None]:
# Local saving: write the model and the tokenizer into the same directory.
local_adapter_dir = "Meta-Llama-3.1-8B-Instruct-law-lora_model"
model.save_pretrained(local_adapter_dir)
tokenizer.save_pretrained(local_adapter_dir)

('Meta-Llama-3.1-8B-Instruct-law-lora_model/tokenizer_config.json',
 'Meta-Llama-3.1-8B-Instruct-law-lora_model/special_tokens_map.json',
 'Meta-Llama-3.1-8B-Instruct-law-lora_model/tokenizer.json')

In [None]:
# Online saving: push the model and the tokenizer to the same Hub repository.
hub_repo_id = "Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

README.md:   0%|          | 0.00/625 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# To load the model from HF (disabled by default; flip the guard to reload from the Hub)
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Are travel agencies reliable sources for visa counseling?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Single unpadded prompt: every position is a real token. Passing the mask
# explicitly avoids generate()'s "attention mask is not set" warning.
attention_mask = torch.ones_like(inputs)

# Stream the answer to stdout as it is generated, without echoing the prompt.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# NOTE(review): max_new_tokens = 2056 exceeds max_seq_length (2048) — possibly a typo for 2048; confirm.
_ = model.generate(input_ids = inputs, attention_mask = attention_mask,
                   streamer = text_streamer, max_new_tokens = 2056,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

In [None]:
# Load Model in HF : Option 2 (disabled by default; plain PEFT/Transformers loading)
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    # Repo id matches the adapter this notebook pushes to the Hub; it previously
    # pointed at a Qwen2.5 repo that this notebook does not train or push.
    model = AutoPeftModelForCausalLM.from_pretrained(
        "Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("Jenitza182/Meta-Llama-3.1-8B-Instruct-law-lora_model")