In [1]:
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import json

# 1. Data Preparation (same as before)
def load_data_from_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def prepare_data_for_training(json_data):
    return {"text": [item["text"] for item in json_data]}

json_file_path = "data_675.json"
raw_data = load_data_from_json(json_file_path)
training_data = prepare_data_for_training(raw_data)
dataset = Dataset.from_dict(training_data)

# 2. Tokenization (with reduced max_length)
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

max_length = 256  # Reduced max_length
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length)

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

# 3. LoRA Configuration (with target_modules)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",]  # Corrected names
)

# 4. Model Loading and Preparation (with 4-bit quantization)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # or torch.float16 if bf16 not supported
    device_map="auto",
    load_in_4bit=True  # Enable 4-bit quantization
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

# # 5. SFT Training (with gradient accumulation and reduced batch size)
# training_args = TrainingArguments(
#     output_dir="phi3-mini-algebra-tutor",
#     per_device_train_batch_size=1,  # Reduced batch size
#     gradient_accumulation_steps=16,  # Increased gradient accumulation
#     learning_rate=2e-4,
#     logging_steps=10,
#     max_steps=500,
#     save_steps=100,
#     optim="paged_adamw_32bit",
#     lr_scheduler_type="cosine",
#     warmup_ratio=0.03,
#     fp16=True,  # or bf16=True
#     push_to_hub=False,
# )

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=tokenized_datasets,
    
#     tokenizer=tokenizer,
#     args=training_args,
#     peft_config=lora_config,
# )

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/675 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

trainable params: 8912896 || all params: 2018053120 || trainable%: 0.4416581462434448


In [2]:

# 5. SFT Training (with gradient accumulation and reduced batch size)
training_args = TrainingArguments(
    output_dir="phi3-mini-algebra-tutor",
    per_device_train_batch_size=1,  # Reduced batch size
    gradient_accumulation_steps=16,  # Increased gradient accumulation
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=500,
    save_steps=100,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    fp16=True,  # or bf16=True
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets,
    
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
)

  trainer = SFTTrainer(


In [3]:
trainer.train()
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.3893
20,0.8574
30,0.5519
40,0.4488
50,0.3987
60,0.3565
70,0.3254
80,0.2824
90,0.264
100,0.2601




In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# 1. Model and Tokenizer Loading
model_name = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_4bit=True
)
model = PeftModel.from_pretrained(model, "phi3-mini-algebra-tutor")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Get the special token IDs
end_token_id = tokenizer.convert_tokens_to_ids(["<|end|>"])[0]
user_token_id = tokenizer.convert_tokens_to_ids(["<|user|>"])[0]
assistant_token_id = tokenizer.convert_tokens_to_ids(["<|assistant|>"])[0]


# 2. Modified Inference Function for Interactive Dialogue
def generate_next_step(prompt):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **input_ids,
            max_new_tokens=100,  # Adjusted to 100, experiment with this
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.15,
            eos_token_id=end_token_id,  # Use the correct <|end|> token ID
            pad_token_id=end_token_id # Use the correct <|end|> token ID
        )

    generated_text = tokenizer.decode(outputs[:, input_ids["input_ids"].shape[-1]:][0], skip_special_tokens=False) #Keep special tokens
    # Splitting and cleaning the response
    generated_text = generated_text.split("<|end|>")[0].strip() #take only assistant
    return generated_text

# 3. Interactive Dialogue Loop
def interactive_session(initial_prompt):
    prompt = initial_prompt
    while True:
        response = generate_next_step(prompt)
        print(response)  # Print the model's response

        user_input = input("Your response (or type 'exit' to quit): ")  # Get user input

        if user_input.lower() == "exit":
            break

        # Append the model's response and the user's input to the prompt. Add special token
        prompt += response + "<|end|>\n<|user|>\n" + user_input + "<|end|>\n<|assistant|>\n"
        #print("current prompt", prompt)




The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Example Usage:
initial_prompt = "<|system|>\nYou are a helpful tutor for solving algebra.\n<|end|>\n\n<|user|>\nSimplify x^3 + 7x^2 - 2x^3.\n<|end|>\n\n<|assistant|>\n"
interactive_session(initial_prompt)

First, combine like terms x^3 and -2x^3 to get -x^3. That leaves us with -x^3 + 7x^2. Do you understand this step?


Your response (or type 'exit' to quit):  yest next step please


Great! Now the expression is -x^3 + 7x^2. Factor out the greatest common factor, which is x²: x²(-1 + 7). Simplify further: x²(6) = 6x². Do you understand this step?


Your response (or type 'exit' to quit):  no can you repeat again


Sure! The GCF here is x², so we factor it out: x²(-1 + 7). This simplifies to x²(6), or 6x². Does that make sense now?


Your response (or type 'exit' to quit):  i understand this problem can you give me new example for my practice


Excellent! Here’s another one: 4x^4y - 3x^2y^2 + 5xy^3. Start by identifying the GCF first; in this case, it's x^2y. Then divide each term by x^2y to simplify: 4x^4y/x^2y = 4x^2, -3x^2y^2/x^2y = -3y


Your response (or type 'exit' to quit):  exit
