In [23]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [24]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [25]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [26]:
from datasets import load_dataset

dataset = load_dataset(
    "text",
    data_files={"train": "phi3_finetune_format.txt"},
    sample_by="paragraph"  # ✅ treats blank-line-separated text as one example
)


In [27]:
print(dataset["train"][0])


{'text': "<|user|>\nRewrite the following text in Goutam's writing style.\nThe burgeoning landscape of AI tools presents an evolving challenge for educators and students alike, with definitive guidelines for their use still taking shape. Despite this fluidity, a broad consensus among professors underscores that reliance on these technologies without cultivating original thought or profound understanding ultimately undermines sustainable success.\n<|end|>\n<|assistant|>\nAs these tools are still emerging and it is tough to deterministically say what is the correct way to use them, however, most professors believe relying on AI tools without original thought or deep understanding will not lead to sustainable success.\n<|end|>"}


In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")


In [29]:
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=2048,  # adjust based on your GPU memory
    )

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])


In [30]:
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"], # Pass the 'train' split
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 80,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/75 [00:00<?, ? examples/s]

In [31]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
6.148 GB of memory reserved.


In [32]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 75 | Num Epochs = 8 | Total steps = 80
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Step,Training Loss
1,3.5113
2,3.4103
3,2.9386
4,2.999
5,2.8943
6,2.7495
7,2.4875
8,2.2768
9,2.3752
10,1.9346


In [33]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

240.8339 seconds used for training.
4.01 minutes used for training.
Peak reserved memory = 6.148 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 41.707 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [44]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": ".\nRewrite the following text in Goutam's writing style.\nThe motivations driving participation in this sleepless rhythm are, without doubt, manifold..\n"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = False)
tokenizer.batch_decode(outputs)

["<|user|> .\nRewrite the following text in Goutam's writing style.\nThe motivations driving participation in this sleepless rhythm are, without doubt, manifold..\n<|end|><|assistant|> The reasons for the same are manifold.\n<|end|><|assistant|> The reasons for the same are manifold.\n<|end|><|assistant|> The reasons for the same are manifold.\n<|end|><|assistant|> The reasons for the same are manifold.\n<|end|><|assistant|> The reasons for the same are manifold.\n<|end|><|assistant|> The reasons for the same are manifold.\n"]

In [35]:
!pip install gradio -q

In [36]:
import gradio as gr

def rewrite_text(text):
    # Prefix the input text
    prefixed_text = f"Rewrite the following text in Goutam's writing style. {text}"

    # Prepare the messages in the correct format for the chat template
    messages = [
        {"from": "human", "value": prefixed_text},
    ]

    # Apply the chat template and tokenize
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    # Generate the output from the model
    outputs = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True) # Increased max_new_tokens for potentially longer outputs

    # Decode the output and extract the assistant's response
    decoded_output = tokenizer.batch_decode(outputs)[0]

    # Find the start and end of the assistant's response
    start_tag = "<|assistant|>"
    end_tag = "<|end|>"
    assistant_start = decoded_output.find(start_tag)
    assistant_end = decoded_output.find(end_tag, assistant_start + len(start_tag))

    if assistant_start != -1 and assistant_end != -1:
        # Extract the text between the assistant tags
        assistant_response = decoded_output[assistant_start + len(start_tag):assistant_end].strip()
    else:
        # If the tags are not found, return the full decoded output (or handle as error)
        assistant_response = decoded_output

    return assistant_response

# Create the Gradio interface
iface = gr.Interface(
    fn=rewrite_text,
    inputs=gr.Textbox(lines=5, label="Enter text to rewrite"),
    outputs=gr.Textbox(lines= 7,label="Rewritten Text"),
    title="Text Rewriter in Goutam's Style",
    description="Enter text and the model will rewrite it in Goutam's writing style."
)

# Launch the interface
iface.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://bbc5ad057d2c33dd2f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://bbc5ad057d2c33dd2f.gradio.live


