# Fine-Tuning Phi-3-mini for Writing Tools

Datasets: grammarly/coedit (proofreading/editing), cnn_dailymail (summarization), PAWS (paraphrasing), JFLEG (grammar correction).

In [None]:
!pip install flash-attn --no-build-isolation
!pip install -q transformers datasets peft trl bitsandbytes accelerate torch

Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/8.4 MB[0m [31m102.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/8.4 MB[0m [31m55.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m7.1/8.4 MB[0m [31m68.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m8.4/8.4 MB[0m [31m73.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [

In [3]:
import torch
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
)
from trl import SFTTrainer, SFTConfig
import warnings
warnings.filterwarnings("ignore")

# Config
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
OUTPUT_DIR = "./phi3-writing-finetuned"
NUM_EPOCHS = 2  #  3 for better performance.
BATCH_SIZE = 8
LEARNING_RATE = 2e-4
NUM_SAMPLES = {
    "coedit": 10000,
    "cnn_dailymail": 5000,
}
MAX_SEQ_LENGTH = 512

# Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)

# LoRA Config (efficient fine-tuning)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,  # Rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["qkv_proj"]  # Phi-3 specific
)
model = get_peft_model(model, peft_config)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [10]:
#Phi-3 instruct template: <|user|>Prompt<|end|>\n<|assistant|>Response<|end|>
def apply_chat_template(examples):
    texts = []
    for prompt, response in zip(examples["prompt"], examples["response"]):
        text = f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n{response}<|end|>\n"
        texts.append(text)
    return {"text": texts}

def format_as_messages(examples):
    messages = []
    for prompt, response in zip(examples["prompt"], examples["response"]):
        messages.append([{"role": "user", "content": prompt}, {"role": "assistant", "content": response}])
    return {"messages": messages}

# CoEdit: Proofreading/Editing
print("Loading CoEdit...")
coedit = load_dataset("grammarly/coedit", split="train").select(range(NUM_SAMPLES["coedit"]))
coedit = coedit.map(lambda x: {
    "prompt": x['src'],
    "response": x['tgt']
})
coedit = coedit.map(format_as_messages, batched=True, remove_columns=coedit.column_names)

# CNN/DailyMail: Summarization
print("Loading CNN/DailyMail...")
cnn = load_dataset("cnn_dailymail", "3.0.0", split="train").select(range(NUM_SAMPLES["cnn_dailymail"]))
cnn = cnn.map(lambda x: {
    "prompt": f"Summarize the following article concisely: {x['article'][:1000]}",
    "response": x["highlights"]
})
cnn = cnn.map(format_as_messages, batched=True, remove_columns=cnn.column_names)

# JFLEG for Grammar
print("Loading JFLEG...")
jfleg = load_dataset("jfleg", split="all")
jfleg = jfleg.select(range(min(3000, len(jfleg))))
jfleg = jfleg.map(lambda x: {
    "prompt": f"Correct the grammar and fluency in this sentence: {x['sentence']}",
    "response": x['corrections'][0]
})
jfleg = jfleg.map(format_as_messages, batched=True, remove_columns=jfleg.column_names)
NUM_SAMPLES["jfleg"] = len(jfleg)

# PAWS for Paraphrasing
print("Loading PAWS...")
paws = load_dataset("paws", "labeled_final", split="train").filter(lambda x: x["label"] == 1).select(range(5000))
paws = paws.map(lambda x: {
    "prompt": f"Paraphrase this sentence to mean the same: {x['sentence1']}",
    "response": x['sentence2']
})
paws = paws.map(format_as_messages, batched=True, remove_columns=paws.column_names)
# Combine all datasets
full_dataset = concatenate_datasets([coedit, cnn, jfleg, paws])
full_dataset = full_dataset.train_test_split(test_size=0.1)
train_ds = full_dataset["train"]
eval_ds = full_dataset["test"]

print(f"Total training samples: {len(train_ds)}")

Loading CoEdit...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Loading CNN/DailyMail...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Loading JFLEG...


Map:   0%|          | 0/1503 [00:00<?, ? examples/s]

Loading PAWS...


labeled_final/train-00000-of-00001.parqu(…):   0%|          | 0.00/8.43M [00:00<?, ?B/s]

labeled_final/test-00000-of-00001.parque(…):   0%|          | 0.00/1.24M [00:00<?, ?B/s]

labeled_final/validation-00000-of-00001.(…):   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Total training samples: 19352


In [12]:
# Training Args
tokenizer.chat_template = """{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{% generation %}{{ '<|assistant|>\n' + message['content'] + '<|end|>\n' }}{% endgeneration %}{% endif %}{% endfor %}"""

tokenizer.save_pretrained(OUTPUT_DIR)
args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    learning_rate=LEARNING_RATE,
    fp16=False,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    eval_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=None,
    remove_unused_columns=False,
    dataset_text_field="messages",
    max_length=MAX_SEQ_LENGTH,
    assistant_only_loss=True,
    packing=False
)


# SFT Trainer
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    args=args,
    peft_config=peft_config,
)

# Train
trainer.train()

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuned model saved to {OUTPUT_DIR}")
print("To test: Use pipeline('text-generation', model=OUTPUT_DIR, tokenizer=tokenizer)")
print("Example prompt: tokenizer.apply_chat_template([{'role': 'user', 'content': 'Proofread: This is a bad sentence.'}], tokenize=False)")

Tokenizing train dataset:   0%|          | 0/19352 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/19352 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2151 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2151 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mclass2t24[0m ([33mclass2t24-self[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
500,0.7376,0.756714,1.770692,1022499.0,0.82018
1000,0.7344,0.745302,1.760112,2054407.0,0.8218
1500,0.7248,0.740097,1.691921,3074322.0,0.823279
2000,0.6542,0.736078,1.694771,4090536.0,0.823727


Fine-tuned model saved to ./phi3-writing-finetuned
To test: Use pipeline('text-generation', model=OUTPUT_DIR, tokenizer=tokenizer)
Example prompt: tokenizer.apply_chat_template([{'role': 'user', 'content': 'Proofread: This is a bad sentence.'}], tokenize=False)


In [None]:
# testing

# Load the fine-tuned model for testing
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch

# Load base model and tokenizer
base_model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=False
)

# Load and merge LoRA adapter
model = PeftModel.from_pretrained(model, "./phi3-writing-finetuned")
model = model.merge_and_unload()

# Pipeline for generation
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

gen_config = {
    "max_new_tokens": 128,
    "temperature": 0.6,
    "do_sample": True,
    "top_p": 0.7,
    "pad_token_id": tokenizer.eos_token_id
}


def generate_response(prompt_messages, few_shot=None, gen_config=gen_config):
    if few_shot:
        prompt_messages = [{"role": "system", "content": few_shot}] + prompt_messages
    prompt_text = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_config)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response.strip()

print("=== Testing Fine-Tuned Phi-3 Writing Model ===\n")

# 1. Proofreading/Editing (CoEdit)
print("1. PROOFREADING/EDITING:")
user_prompt = "Proofread and rewrite this text for clarity, grammar, and fluency: The quick brown fox jumps over lazy dog."
messages = [{"role": "user", "content": user_prompt}]
response = generate_response(messages)
print(f"Input: {user_prompt}")
print(f"Output: {response}\n")

# 2. Summarization (CNN/DailyMail-style)
print("2. SUMMARIZATION:")
user_prompt = "Summarize the following article concisely: In a surprising turn of events, scientists discovered a new species of fish in the Pacific Ocean. It has bioluminescent features and lives in deep waters. Researchers believe it could hold clues to ocean mysteries."
messages = [{"role": "user", "content": user_prompt}]
response = generate_response(messages)
print(f"Input: {user_prompt}")
print(f"Output: {response}\n")

# 3. Grammar Check (CoLA-style)
print("3. GRAMMAR CHECK:")
user_prompt = "Check grammar and correct if needed: I go to store yesterday and buyed apples."
messages = [{"role": "user", "content": user_prompt}]
response = generate_response(messages)
print(f"Input: {user_prompt}")
print(f"Output: {response}\n")

# 4. Paraphrasing (MRPC-style)
print("4. PARAPHRASING:")
user_prompt = "Paraphrase this sentence while keeping the meaning: The weather is nice today."
messages = [{"role": "user", "content": user_prompt}]
response = generate_response(messages)
print(f"Input: {user_prompt}")
print(f"Output: {response}\n")


from datasets import load_dataset
test_coedit = load_dataset("grammarly/coedit", split="validation").select(range(3))
for i, ex in enumerate(test_coedit):
    prompt = ex['src']
    expected = ex['tgt']
    messages = [{"role": "user", "content": prompt}]
    response = generate_response(messages)
    print(f"Sample {i+1}:")
    print(f"Prompt: {prompt[:100]}...")
    print(f"Generated: {response}")
    print(f"Expected: {expected[:100]}...")
    print("---")

print("\n=== Testing Complete! ===")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


=== Testing Fine-Tuned Phi-3 Writing Model ===

1. PROOFREADING/EDITING:
Input: Proofread and rewrite this text for clarity, grammar, and fluency: The quick brown fox jumps over lazy dog.
Output: The quick brown fox jumps over the lazy dog.

2. SUMMARIZATION:
Input: Summarize the following article concisely: In a surprising turn of events, scientists discovered a new species of fish in the Pacific Ocean. It has bioluminescent features and lives in deep waters. Researchers believe it could hold clues to ocean mysteries.
Output: Scientists discovered a new species of fish in the Pacific Ocean .
It has bioluminescent features and lives in deep waters .
Researchers believe it could hold clues to ocean mysteries .

3. GRAMMAR CHECK:
Input: Check grammar and correct if needed: I go to store yesterday and buyed apples.
Output: I went to the store yesterday and bought apples.

4. PARAPHRASING:
Input: Paraphrase this sentence while keeping the meaning: The weather is nice today.
Output: The wea

In [19]:
!pip install rouge_score evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d95b1ccc8e0348e600ce2e3c7eec595fe0dbe9b545ff21f4a28480930d6a1a0f
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [20]:
import evaluate
from datasets import load_dataset

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")


test_coedit = load_dataset("grammarly/coedit", split="validation").select(range(10))
predictions = []
references = []

for ex in test_coedit:
    prompt = ex['src']
    expected = ex['tgt']
    messages = [{"role": "user", "content": prompt}]
    response = generate_response(messages)
    predictions.append(response)
    references.append(expected)

rouge_results = rouge.compute(predictions=predictions, references=references)
bleu_results = bleu.compute(predictions=predictions, references=references)

print("ROUGE Scores:", rouge_results)
print("BLEU Score:", bleu_results)

for i in range(len(predictions)):
    print(f"Sample {i+1}:")
    print(f"Generated: {predictions[i][:100]}...")
    print(f"Reference: {references[i][:100]}...")
    print("---")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

ROUGE Scores: {'rouge1': np.float64(0.7885944906590401), 'rouge2': np.float64(0.6238199303480565), 'rougeL': np.float64(0.7478417327096565), 'rougeLsum': np.float64(0.7482298571073511)}
BLEU Score: {'bleu': 0.590505051645395, 'precisions': [0.8333333333333334, 0.6637554585152838, 0.5290178571428571, 0.4155251141552511], 'brevity_penalty': 1.0, 'length_ratio': 1.0151843817787418, 'translation_length': 468, 'reference_length': 461}
Sample 1:
Generated: Why are you arresting me?...
Reference: Why am I being arrested?...
---
Sample 2:
Generated: First of all, from you just read to find in the poems or novel what well-known critics have already ...
Reference: First of all, if you read just to find in the poem or novel what well-known critics have already fou...
---
Sample 3:
Generated: Their research showed that before Hurricane Sandy, " only about 50 percent of residents used the eme...
Reference: Their research showed that before Hurricane Sandy, only " about 50 percent of residents used 

In [25]:
#Quantizations
from transformers import BitsAndBytesConfig
quant_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained("./phi3-writing-finetuned", quantization_config=quant_config)
model.save_pretrained("./phi3-quantized_q4_0_v1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [32]:
!zip -r /content/phi_3_finetuned_fullmodel.zip /content/llama.cpp/phi3-merged

  adding: content/llama.cpp/phi3-merged/ (stored 0%)
  adding: content/llama.cpp/phi3-merged/chat_template.jinja (deflated 60%)
  adding: content/llama.cpp/phi3-merged/tokenizer.model (deflated 55%)
  adding: content/llama.cpp/phi3-merged/config.json (deflated 52%)
  adding: content/llama.cpp/phi3-merged/configuration_phi3.py (deflated 72%)
  adding: content/llama.cpp/phi3-merged/model.safetensors.index.json (deflated 95%)
  adding: content/llama.cpp/phi3-merged/generation_config.json (deflated 33%)
  adding: content/llama.cpp/phi3-merged/model-00001-of-00002.safetensors


zip error: Interrupted (aborting)


In [None]:
!apt update -qq && apt install -qq build-essential git cmake
!pip install huggingface-hub torch transformers

!git clone https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
!make clean && make -j libllama.so

46 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m
build-essential is already the newest version (12.9ubuntu3).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 46 not upgraded.
Cloning into 'llama.cpp'...
remote: Enumerating objects: 67212, done.[K
remote: Counting objects: 100% (305/305), done.[K
remote: Compressing objects: 100% (179/179), done.[K
remote: Total 67212 (delta 246), reused 126 (delta 126), pack-reused 66907 (from 4)[K
Receiving objects: 100% (67212/67212), 194.15 MiB | 16.86 MiB/s, done.
Resolving deltas: 100% (48819/48819), done.
/content/llama.cpp
Makefile:6: *** Build system changed:
 The Makefile build has been replaced by CM

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Merged model saved to ./phi3-merged
ERROR:hf-to-gguf:Error: ../phi3-merged is not a directory
GGUF exported: phi3-writing-Q4_K_M.gguf (~2GB)
Download: Use Colab's Files tab → right-click → Download
/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `{CMAKE_ARGS="-DLLAMA_CUBLAS=on" if torch.cuda.is_available() else ""} make -j  # Rebuild with CUDA if GPU'
/bin/bash: line 1: ./llama-cli: No such file or directory


In [None]:
 #Merge LoRA
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

base_model = "microsoft/Phi-3-mini-4k-instruct"
merged_dir = "./phi3-merged/"

# Load base + LoRA → Merge
model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16, trust_remote_code=True)
model = PeftModel.from_pretrained(model, "../phi3-writing-finetuned")
model = model.merge_and_unload()

# Save merged HF model
model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.save_pretrained(merged_dir)

print(f"Merged model saved to {merged_dir}")

# Convert to GGUF
!python convert_hf_to_gguf.py /content/llama.cpp/phi3-merged --outfile phi3-writing-Q8.gguf --outtype q8_0

INFO:hf-to-gguf:Loading model: phi3-merged
INFO:hf-to-gguf:Model architecture: Phi3ForCausalLM
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: indexing model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:gguf: indexing model part 'model-00002-of-00002.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> Q8_0, shape = {3072, 32064}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.bfloat16 --> Q8_0, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.bfloat16 --> Q8_0, shape = {3072, 16384}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.bfloat16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_output.weight,  torch.bfloat16 --> Q8_0, shape = {3072, 3072}
INFO:hf-to-gguf:blk.0.attn_qkv.weight,    

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
!cp -r "/content/llama.cpp/phi3-merged" "/content/drive/MyDrive"

In [3]:
# gradio from gguf
from llama_cpp import Llama
import torch

# Load GGUF
model_path = "D:/ttest/Project/phi3-writing-Q8.gguf"
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=4,
    n_gpu_layers=0,  # 0 for CPU-only; >0 if CUDA
    verbose=False
)

def generate_response(prompt_messages, max_tokens=128, temperature=0.1):
    formatted_prompt = ""
    for msg in prompt_messages:
        role = msg["role"]
        content = msg["content"]
        if role == "user":
            formatted_prompt += f"<|user|>\n{content}<|end|>\n<|assistant|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|assistant|>\n{content}<|end|>\n"
    # Add generation prompt
    formatted_prompt += "<|assistant|>" 
    
    # Generate
    output = llm(
        formatted_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        stop=["<|end|>", "<|user|>"],
        echo=False
    )
    response = output['choices'][0]['text'].strip()
    return response

# Step 4: Gradio UI
import gradio as gr

def demo_interface(input_text, main_option, tone_option):
    if main_option == "Tones":
        prefix = f"Rewrite this text in a {tone_option} tone:"
    else:
        task_prompts = {
            "Rewrite": "Rewrite this text:",
            "Summarize": "Summarize this text concisely:",
            "Proofread": "Proofread and edit this text for clarity:",
            "Convert to List": "Convert this text to a bullet-point list:"
        }
        prefix = task_prompts.get(main_option, "Process this text:")
        if tone_option:
            prefix += f" in a {tone_option} tone"

    full_prompt = f"{prefix} {input_text}"

    messages = [{"role": "user", "content": full_prompt}]
    response = generate_response(messages)
    return response

# UI Components
main_options = ["Rewrite", "Summarize", "Proofread", "Convert to List", "Tones"]
tones = ["concise", "professional", "friendly", "longer"]

with gr.Blocks(title="Writing Assistant") as iface:
    gr.Markdown("# Writing Assistant")
    gr.Markdown("Select a main option. For 'Tones', choose a sub-tone to apply. Powered by quantized Phi-3.")

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text to process...", lines=3)
        main_dropdown = gr.Dropdown(choices=main_options, label="Main Option", value="Rewrite")

    with gr.Row():
        tone_dropdown = gr.Dropdown(choices=tones, label="Tone (Sub-Option)", value="professional", visible=False)
        submit_btn = gr.Button("Process", variant="primary")

    output_text = gr.Textbox(label="Output", lines=5)

    def toggle_tone_visibility(selected_option):
        return gr.update(visible=(selected_option == "Tones"))

    main_dropdown.change(
        fn=toggle_tone_visibility,
        inputs=main_dropdown,
        outputs=tone_dropdown
    )

    submit_btn.click(
        fn=demo_interface,
        inputs=[input_text, main_dropdown, tone_dropdown],
        outputs=output_text
    )

iface.launch(share=True, debug=True)

llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


Keyboard interruption in main thread... closing server.


