In [1]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import json
from sklearn.model_selection import train_test_split  # For splitting the dataset

In [2]:
# 1. Data Preparation
def load_data_from_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def prepare_data_for_training(json_data):
    return {"text": [item["text"] for item in json_data]}

json_file_path = "completed_data.json"
raw_data = load_data_from_json(json_file_path)
training_data = prepare_data_for_training(raw_data)
dataset = Dataset.from_dict(training_data)


In [3]:
# 2. Split Dataset
train_data, val_data = train_test_split(dataset['text'], test_size=0.2, random_state=42)  # 80/20 split
train_dataset = Dataset.from_dict({"text": train_data}) # Convert training data to dataset object
val_dataset = Dataset.from_dict({"text": val_data})  # Convert validation data to dataset object


In [4]:

# 3. Tokenization
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

max_length = 256
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])


Map (num_proc=4):   0%|          | 0/1030 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/258 [00:00<?, ? examples/s]

In [5]:
# 4. LoRA Configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",]
)

In [6]:
# 5. Model Loading and Preparation
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_4bit=True
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8912896 || all params: 2018053120 || trainable%: 0.4416581462434448


In [7]:
# Assuming you have 1288 data points in your *training* dataset
# And per_device_train_batch_size=1 and gradient_accumulation_steps=16
# Steps per epoch = (Training Data Size / Batch Size) / Gradient Accumulation Steps
# Steps per epoch = (1288 / 1) / 16 = 80.5
# Training for approximately 3 epochs: 80.5 * 3 = 241.5

training_args = TrainingArguments(
    output_dir="phi3-mini-algebra-tutor-v4",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=242,  # Approximately 3 epochs - Adjust THIS value if needed
    save_steps=50, #Reduce number save steps otherwise its save the model after very long times
    eval_steps=50,  # Evaluate every 100 steps
    evaluation_strategy="steps", # Evaluate during training
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    fp16=True,
    push_to_hub=False,
)



In [9]:
# 7. SFT Training with Validation Data
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset, # Pass validation dataset
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
)

  trainer = SFTTrainer(


In [10]:
trainer.train()
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,0.3694,0.351831
100,0.2887,0.269631
150,0.2165,0.224786
200,0.2081,0.20429




In [11]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Replace with the path to your *locally saved* fine-tuned model directory
model_path = "phi3-mini-algebra-tutor-big-data"  # *Your local LoRA adapter directory*
model_name = "microsoft/Phi-3-mini-4k-instruct" #Base model

# Replace with your Hugging Face username and desired repo name
hub_model_id = "alam1n/phi3-mini-algebra-tutor-v4"

# Load the base model (you might not need this if you saved the entire model)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_4bit=True # or use BitsAndBytesConfig if you used it for training
)

# Load the LoRA adapter (your fine-tuned weights)
model = PeftModel.from_pretrained(model, model_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Push the model and tokenizer to the Hub
model.push_to_hub(hub_model_id, commit_message="Add fine-tuned LoRA weights")
tokenizer.push_to_hub(hub_model_id, commit_message="Add tokenizer")

print(f"Model and tokenizer pushed to {hub_model_id}")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/35.7M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model and tokenizer pushed to alam1n/phi3-mini-algebra-tutor-v4
