In [None]:
# 1. setup and installation 
!pip install unsloth_zoo==2025.6.8
!pip install git+https://github.com/unslothai/unsloth.git
!pip install bitsandbytes>=0.41.0
!pip install accelerate>=0.20.0
!pip install peft>=0.4.0
!pip install transformers>=4.32.0
!pip install trl>=0.4.7
!pip install protobuf==5.29.1 fsspec==2025.3.2 --upgrade --force-reinstall

In [None]:
# Cell 2: Model and LoRA Adapter Setup
from unsloth import FastLanguageModel
import torch

# --- Configuration ---
# These module-level names are shared kernel state: later cells read
# `max_seq_length` (trainer and tokenizer truncation) as well as `model` and
# `tokenizer`, so this cell must run before them.
max_seq_length = 1024
dtype = torch.float16  # float16 weights; the training cell sets fp16=True to match
load_in_4bit = True
model_name = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"
lora_rank = 32

# --- Load Base Model and Tokenizer ---
print(f"Loading base model: {model_name}")
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to run
# locally — confirm it is actually required for this checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto",
    trust_remote_code=True,
)
print("✓ Model and tokenizer loaded successfully.")

# --- Configure LoRA for Fine-Tuning ---
print(f"\nApplying LoRA adapter with rank={lora_rank}...")
# `model` is rebound to the adapter-wrapped model returned by get_peft_model.
# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down); lora_alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=lora_rank,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=False,
    loftq_config=None,
)
print("✓ LoRA adapter configured. Model is ready for training.")

# --- Final Memory Check ---
# Reports memory for CUDA device index 0 only, converted from bytes to GiB (1024**3).
if torch.cuda.is_available():
    print(f"\nGPU Memory Usage after setup:")
    allocated_gb = torch.cuda.memory_allocated(0) / 1024**3
    reserved_gb = torch.cuda.memory_reserved(0) / 1024**3
    print(f"  - Allocated: {allocated_gb:.2f} GB")
    print(f"  - Reserved:  {reserved_gb:.2f} GB")

In [None]:
# Cell 3: Data Cleaning and Preparation
import json
from datasets import Dataset
from sklearn.model_selection import train_test_split
import re

# --- System Prompt and Formatting ---
# This is the master prompt that defines the agent's behavior.
# It is stored as the "instruction" of every training record and reused verbatim at
# inference time. It contains literal `{`/`}` characters, so it must only ever be
# passed as a *value* to str.format, never used as a format template itself.
SYSTEM_PROMPT = """You are an automated banking customer service ticket analysis system. Your purpose is to parse a customer's request and structure it into a standardized JSON format for internal ticketing.

You must perform the following actions:
1. Carefully analyze the user's input to understand their intent and key details.
2. Populate all fields in the JSON object based only on the user's text. Do not invent information.
3. Adhere strictly to the defined categories for ticket_type, severity, and other categorical fields.
4. If the user's request is NOT related to banking or financial services (e.g., tech support for a personal computer, dating advice), you MUST reject it by responding with {"error": "Request is outside the banking support domain."}.
5. Your entire response must be ONLY the JSON object, with no conversational text, apologies, or explanations.

The required JSON format is:
{
    "ticket_type": "complaint" | "inquiry" | "assistance",
    "title": "A brief, descriptive summary of the user's issue.",
    "description": "A more detailed description based on the user's full input.",
    "severity": "low" | "medium" | "high" | "critical",
    "department_impacted": "The most relevant bank department.",
    "service_impacted": "The specific banking service affected.",
    "supporting_documents": "Attached documents and files by the customer",
    "preferred_communication": "preferred method to contact the customer"
}"""

# Standard Alpaca prompt template
# Filled with str.format using the keys `instruction`, `input` and `output`.
# For inference the notebook passes output="" so the text ends right after
# "### Response:" and the model continues from there.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# --- Load and Clean the Dataset ---
def _clean_output(raw_output):
    """Normalize one raw `output` value into a canonical JSON string.

    Args:
        raw_output: The record's `output` field. Expected to be a string; a dict
            (already-parsed JSON) and None (treated like a missing field) are
            also accepted.

    Returns:
        A JSON string to use as the training target, or None when the record
        cannot be turned into a trustworthy label (unparseable JSON, an empty
        `severity` list, or an unsupported value type). Callers should drop
        records for which None is returned.
    """
    if raw_output is None:
        raw_output = ""

    if isinstance(raw_output, str):
        # Standardize non-JSON rejection messages into the correct JSON error format
        if not raw_output.strip().startswith('{'):
            return json.dumps({"error": "Request is outside the banking support domain."})
        try:
            output_json = json.loads(raw_output)
        except json.JSONDecodeError:
            # A broken label must not be rewritten into an "error" target: that would
            # teach the model to answer a legitimate request with an error object.
            return None
    elif isinstance(raw_output, dict):
        output_json = dict(raw_output)  # copy so the loaded dataset is not mutated
    else:
        return None

    # Fix 1: 'ticket_type' from "asking for assistance" to "assistance"
    if output_json.get("ticket_type") == "asking for assistance":
        output_json["ticket_type"] = "assistance"

    # Fix 2: 'severity' from a list to a string (take the first element)
    severity = output_json.get("severity")
    if isinstance(severity, list):
        if not severity:
            return None  # no usable severity label
        output_json["severity"] = severity[0]

    # Fix 3: Remove non-standard keys like 'assistance_request'
    output_json.pop("assistance_request", None)

    # Remove random numeric values from title (e.g. a trailing " - $123")
    title = output_json.get("title")
    if isinstance(title, str):
        output_json["title"] = re.sub(r' - \$\d+', '', title)

    return json.dumps(output_json)


file_path = 'banking_complaints_dataset1k.json'
print(f"Loading and cleaning '{file_path}'...")
cleaned_data = []
skipped_records = 0

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"✗ ERROR: Dataset file not found at '{file_path}'. Please ensure the file exists.")
    # Stop execution if the data isn't available
    raise

for item in data:
    output_str = _clean_output(item.get("output", ""))
    if output_str is None:
        skipped_records += 1
        continue

    cleaned_data.append({
        "instruction": SYSTEM_PROMPT,
        "input": item.get("input", ""),
        "output": output_str
    })

print(f"✓ Successfully loaded and cleaned {len(cleaned_data)} records.")
if skipped_records:
    print(f"⚠ Skipped {skipped_records} records whose output could not be cleaned.")

# --- Split and Format Data ---
# 80% train; the remaining 20% is halved into eval and test (10% each).
train_data, temp_data = train_test_split(
    cleaned_data,
    test_size=0.2,
    random_state=42,
)
eval_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

print("\nDataset splits:")
for split_label, split_records in (
    ("Train: ", train_data),
    ("Eval:  ", eval_data),
    ("Test:  ", test_data),
):
    print(f"  - {split_label}{len(split_records)} samples")

EOS_TOKEN = tokenizer.eos_token

# Render every record through the Alpaca template and terminate it with the EOS token.
train_texts = []
for record in train_data:
    train_texts.append(alpaca_prompt.format(**record) + EOS_TOKEN)

eval_texts = []
for record in eval_data:
    eval_texts.append(alpaca_prompt.format(**record) + EOS_TOKEN)

# Wrap the rendered prompts in Hugging Face Dataset objects with a single "text" column.
train_dataset = Dataset.from_dict(dict(text=train_texts))
eval_dataset = Dataset.from_dict(dict(text=eval_texts))

print("\n✓ Datasets prepared for training.")
print("\n--- Sample Formatted Training Prompt ---")
sample_text = train_dataset[0]["text"]
print(f"{sample_text[:1000]}...")

In [None]:
# Cell 4: Training, Evaluation, and Inference with Validation & Repair Loop
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import gc
import json # Ensure json is imported for validation

# --- Clear GPU Memory ---
# Run the garbage collector first: tensors kept alive only by reference cycles are
# freed back to PyTorch's caching allocator, and empty_cache() can then release those
# blocks too. Calling empty_cache() before gc.collect() would leave them cached.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- Callback for Metrics Tracking ---
class EnhancedLossCallback(TrainerCallback):
    """Trainer callback that records losses and learning rates from log events.

    Every series keeps its own list of step numbers, so each curve is plotted
    against the steps at which its values were actually logged.

    Attributes (all plain lists, appended to in `on_log`):
        train_losses / steps:      training loss and the step it was logged at.
        eval_losses / eval_steps:  evaluation loss and the step it was logged at.
        learning_rates / lr_steps: learning rate and the step it was logged at.
    """

    def __init__(self):
        self.train_losses, self.eval_losses = [], []
        self.learning_rates, self.steps, self.eval_steps = [], [], []
        # Separate x-axis for the learning rate: it is recorded under a different
        # condition than the training loss, so reusing `steps` could mismatch lengths.
        self.lr_steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append any of `loss`, `eval_loss`, `learning_rate` found in `logs`."""
        if not logs:
            return
        step = state.global_step
        if "loss" in logs:
            self.train_losses.append(logs["loss"])
            self.steps.append(step)
        if "eval_loss" in logs:
            self.eval_losses.append(logs["eval_loss"])
            self.eval_steps.append(step)
        if "learning_rate" in logs:
            self.learning_rates.append(logs["learning_rate"])
            self.lr_steps.append(step)

    def plot_metrics(self, save_path="training_metrics.png"):
        """Plot loss curves and the learning-rate schedule, save to `save_path`, and show.

        Also prints the last recorded validation loss when at least one was logged.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

        ax1.plot(self.steps, self.train_losses, 'b-', label='Training Loss')
        ax1.plot(self.eval_steps, self.eval_losses, 'r-', label='Validation Loss', linewidth=2)
        ax1.set_ylabel('Loss')
        ax1.set_title('Training & Validation Loss')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        ax2.plot(self.lr_steps, self.learning_rates, 'g-')
        ax2.set_xlabel('Steps')
        ax2.set_ylabel('Learning Rate')
        ax2.set_title('Learning Rate Schedule')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path, dpi=300)
        plt.show()

        if self.eval_losses:
            print(f"Final Validation Loss: {self.eval_losses[-1]:.4f}")

# Callback instance that collects loss / learning-rate values for the plot below.
loss_callback = EnhancedLossCallback()

# --- Training Arguments ---
# Two sequences per device with 4 accumulation steps. Evaluation runs every 25
# steps and checkpoints are written every 50 (a multiple of eval_steps), so the
# checkpoint with the lowest eval_loss can be reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./banking_assistant_v1",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    learning_rate=1.5e-4,
    fp16=True,  # matches dtype=torch.float16 used when the model was loaded
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=50,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    optim="adamw_torch",
    weight_decay=0.01,
    seed=42,
)

# --- Initialize and Run Trainer ---
# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` are passed
# directly to SFTTrainer; whether these keywords are accepted depends on the
# installed TRL version — confirm against the version pinned for this notebook.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # column created when the datasets were built
    max_seq_length=max_seq_length,
    args=training_args,
    callbacks=[loss_callback],
    packing=False,
)

print("Starting training...")
trainer.train()
print("✓ Training complete.")

# --- Post-Training ---
print("\nPlotting metrics...")
loss_callback.plot_metrics()

# The same directory name is read by the upload and merge cells further down.
print("\nSaving final LoRA adapters...")
model.save_pretrained("banking_assistant_final_adapters")
tokenizer.save_pretrained("banking_assistant_final_adapters")
print("✓ Adapters saved to 'banking_assistant_final_adapters'.")

# --- Final Evaluation and Inference with Validation & Repair ---
print("\n" + "="*50 + "\nEVALUATION AND INFERENCE\n" + "="*50)
# Unsloth call made before any generation below; presumably switches the model
# to its inference mode — see the Unsloth documentation.
FastLanguageModel.for_inference(model)

def generate_and_validate_json(user_input, max_retries=1):
    """Generate a JSON ticket for `user_input`, retrying with a repair prompt if needed.

    The model is prompted with SYSTEM_PROMPT and the user's text. If the generated
    text is not valid JSON, the faulty text and the parser error are sent back to
    the model in a "fix this JSON" prompt, up to `max_retries` additional times.

    Args:
        user_input: The customer's request text.
        max_retries: Number of repair attempts after the first generation
            (negative values are treated as 0).

    Returns:
        The parsed JSON value on success, otherwise
        {"error": "Failed to generate valid JSON after multiple attempts."}.
    """
    max_retries = max(max_retries, 0)
    prompt = alpaca_prompt.format(instruction=SYSTEM_PROMPT, input=user_input, output="")

    for attempt in range(max_retries + 1):
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True, pad_token_id=tokenizer.eos_token_id)

        # Decode only the newly generated tokens. Splitting the full decoded text on
        # "### Response:" picks the wrong segment whenever the user's input itself
        # contains that marker.
        prompt_length = len(inputs["input_ids"][0])
        json_str = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        try:
            # Attempt to parse the JSON
            return json.loads(json_str)  # Success!
        except json.JSONDecodeError as e:
            print(f"\n- Attempt {attempt + 1} failed: Invalid JSON generated.")
            print(f"  Error: {e}")
            print(f"  Faulty Output: {json_str}")

            if attempt >= max_retries:
                print("- Max retries reached. Returning failure.")
                return {"error": "Failed to generate valid JSON after multiple attempts."}

            # Construct a repair prompt
            repair_instruction = (
                "The following JSON is invalid. Please fix it. "
                f"The error was: {e}. Do not add any commentary, just provide the corrected JSON object."
            )
            prompt = alpaca_prompt.format(
                instruction=repair_instruction,
                input=json_str,
                output=""
            )
            print("- Retrying with a repair prompt...")

# --- Sample Predictions (Qualitative Check) ---
# Three in-domain banking requests plus one out-of-domain request, each run through
# the validate-and-repair generation helper and pretty-printed.
print("\n--- Running Sample Predictions with Validation & Repair ---")
test_cases = [
    "My debit card was declined at the ATM but I have sufficient funds.",
    "Can you help me set up automatic bill payments?",
    "Can you help me fix my laptop? It won't start.", # Out-of-domain
    "Someone made unauthorized purchases with my credit card in another country.",
]

for sample_request in test_cases:
    banner = "=" * 20
    print(f"\n{banner}\nInput: {sample_request}")
    validated_json = generate_and_validate_json(sample_request)
    pretty_ticket = json.dumps(validated_json, indent=2)
    print("\nFinal Validated JSON Output:", pretty_ticket, sep="\n")

In [None]:
# Cell 5: Final Test Set Evaluation
import torch
import json
from tqdm import tqdm
import numpy as np
from unsloth import FastLanguageModel

# Banner announcing the held-out test-set evaluation.
header_rule = "=" * 50
print(header_rule, "  FINAL MODEL EVALUATION ON HELD-OUT TEST SET", header_rule, sep="\n")

# --- 1. Load the Best Fine-Tuned Model ---
# This ensures we are using the model that performed best on the validation set during training.
# If you've already run the training cell, the 'model' variable is already the best one.
# If you are running this in a new session, uncomment the lines below:
# print("Loading best model from checkpoint...")
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="banking_assistant_final_adapters", # Or your final checkpoint directory
#     max_seq_length=max_seq_length,
#     dtype=dtype,
#     load_in_4bit=load_in_4bit,
# )
# FastLanguageModel.for_inference(model)
# print("✓ Model loaded for inference.")


# --- 2. Quantitative Evaluation: Perplexity and Loss ---
# This measures how "surprised" the model is by the test data. Lower is better.

def calculate_test_perplexity(model, tokenizer, test_data, batch_size=4):
    """Calculates token-level loss and perplexity on the test set.

    Each record is rendered with the Alpaca template (including its ground-truth
    output) and scored by the model. Padding positions are excluded from the loss,
    and each batch's mean loss is weighted by the number of tokens it was computed
    over, so the result does not depend on `batch_size`.

    Args:
        model: Causal LM that returns an object with a `.loss` when given `labels`.
        tokenizer: Tokenizer used to encode and pad the prompts.
        test_data: List of dicts with `instruction`, `input` and `output` keys.
        batch_size: Number of prompts scored per forward pass.

    Returns:
        (avg_loss, perplexity): mean negative log-likelihood per predicted token
        and its exponential.

    Raises:
        ValueError: If `test_data` is empty or yields no tokens to score.
    """
    if not test_data:
        raise ValueError("test_data is empty; cannot compute perplexity.")

    model.eval()
    total_nll = 0.0
    total_tokens = 0

    # Prepare the test texts
    test_prompts = [alpaca_prompt.format(**item) + EOS_TOKEN for item in test_data]

    print(f"\nCalculating perplexity on {len(test_prompts)} test samples...")

    with torch.no_grad():
        for i in tqdm(range(0, len(test_prompts), batch_size), desc="Test Batches"):
            batch = test_prompts[i : i + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length).to("cuda")

            # Labels are the input ids, with padding positions set to -100 so the
            # loss ignores them.
            labels = inputs["input_ids"].clone()
            if "attention_mask" in inputs:
                labels[inputs["attention_mask"] == 0] = -100

            outputs = model(**inputs, labels=labels)

            # The loss is a mean over the predicted positions (labels shifted by one),
            # so weight it by that count to recover the summed NLL for the batch.
            n_tokens = int((labels[:, 1:] != -100).sum())
            total_nll += outputs.loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError("No tokens to score in test_data; cannot compute perplexity.")

    avg_loss = total_nll / total_tokens
    perplexity = np.exp(avg_loss)

    return avg_loss, perplexity

# Score the held-out split and report loss / perplexity to four decimals.
test_loss, test_perplexity = calculate_test_perplexity(
    model,
    tokenizer,
    test_data,
)

print("\n--- Quantitative Metrics ---")
for metric_label, metric_value in (
    ("Test Set Loss", test_loss),
    ("Test Set Perplexity", test_perplexity),
):
    print(f"✅ {metric_label}: {metric_value:.4f}")
print("(Note: Perplexity measures how well the model predicts the next token. Lower is better.)")


# --- 3. Qualitative Evaluation: Accuracy of JSON Fields ---
# This measures how well the model performs on the actual task.

def evaluate_json_accuracy(model, tokenizer, test_data):
    """Generates responses for the test set and compares key JSON fields.

    Every test record counts towards the denominator of its category (banking
    request or out-of-domain "error" request), including records for which the
    model produced unusable output, so a failed generation lowers the field
    accuracies instead of being silently excluded.

    Args:
        model: Model exposing `eval()` and `generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        test_data: List of dicts with `instruction`, `input` and `output`
            (ground-truth JSON string) keys.

    Returns:
        (accuracies, failed_generations):
            accuracies maps metric name to a percentage (0 when its denominator
            is 0); failed_generations is a list of {"input", "output"} dicts for
            generations that were not a JSON object, where "output" is the
            generated text only (without the prompt).
    """
    model.eval()

    correct_predictions = {
        "ticket_type": 0,
        "severity": 0,
        "department_impacted": 0,
        "is_valid_json": 0,
        "is_correct_error": 0,
    }
    total_banking_requests = 0
    total_error_requests = 0
    failed_generations = []

    print(f"\nCalculating JSON field accuracy on {len(test_data)} test samples...")

    for item in tqdm(test_data, desc="Evaluating JSON Accuracy"):
        # Classify the record from its ground truth first, so the denominators do
        # not depend on whether the model's output turns out to be parseable.
        ground_truth_json = json.loads(item["output"])
        is_error_request = "error" in ground_truth_json
        if is_error_request:
            total_error_requests += 1
        else:
            total_banking_requests += 1

        # Prepare the prompt for inference (without the ground truth output)
        prompt = alpaca_prompt.format(instruction=item["instruction"], input=item["input"], output="")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        # Generate the model's prediction
        outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True, pad_token_id=tokenizer.eos_token_id)

        # Decode only the generated tokens; splitting the full text on
        # "### Response:" breaks when the input itself contains that marker.
        prompt_length = len(inputs["input_ids"][0])
        response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        try:
            predicted_json = json.loads(response_text)
        except json.JSONDecodeError:
            predicted_json = None

        # Valid JSON that is not an object (e.g. a list or a bare string) cannot be
        # compared field by field, so it is treated as a failed generation too.
        if not isinstance(predicted_json, dict):
            failed_generations.append({"input": item["input"], "output": response_text})
            continue
        correct_predictions["is_valid_json"] += 1

        # Compare fields
        if is_error_request:
            if predicted_json.get("error") == ground_truth_json.get("error"):
                correct_predictions["is_correct_error"] += 1
        else:
            for field in ("ticket_type", "severity", "department_impacted"):
                if predicted_json.get(field) == ground_truth_json.get(field):
                    correct_predictions[field] += 1

    def _percentage(count, total):
        """Return count/total as a percentage, or 0 when total is 0."""
        return (count / total) * 100 if total > 0 else 0

    # Calculate accuracies
    accuracies = {
        "JSON Validation Rate": _percentage(correct_predictions["is_valid_json"], len(test_data)),
        "Ticket Type Accuracy": _percentage(correct_predictions["ticket_type"], total_banking_requests),
        "Severity Accuracy": _percentage(correct_predictions["severity"], total_banking_requests),
        "Department Accuracy": _percentage(correct_predictions["department_impacted"], total_banking_requests),
        "Error Handling Accuracy": _percentage(correct_predictions["is_correct_error"], total_error_requests),
    }

    return accuracies, failed_generations

# Run the field-level evaluation, then report each metric as a percentage.
accuracies, failed_generations = evaluate_json_accuracy(
    model,
    tokenizer,
    test_data,
)

print("\n--- Qualitative Metrics (Task-Specific Accuracy) ---")
for metric_name in accuracies:
    print(f"✅ {metric_name}: {accuracies[metric_name]:.2f}%")

if not failed_generations:
    print("\n✅ No invalid JSON generations were found in the test set!")
else:
    print("\n--- Review of Failed Generations (Invalid JSON) ---")
    # Show at most the first five failures to keep the cell output short.
    for failure_number, failure in enumerate(failed_generations[:5], start=1):
        print(f"\nFailure #{failure_number}:")
        print(f"  Input: {failure['input']}")
        print(f"  Model Output: {failure['output']}")

In [None]:
# 6 Simple cell to upload LoRA adapters to Hugging Face

from huggingface_hub import HfApi, login
import os

# --- 1. CONFIGURATION ---

# The local folder where your trained adapters are saved.
ADAPTER_PATH = "banking_assistant_final_adapters"

# The name of your repository on the Hugging Face Hub.
HF_REPO_NAME = "LaythAbuJafar/Agent1_Adapters"

# Your Hugging Face token with 'write' permissions.
# Read from the HF_TOKEN environment variable so the secret is never stored in the
# notebook. When it is unset, login() falls back to its interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- 2. UPLOAD SCRIPT ---

print(f"--- Starting Upload to {HF_REPO_NAME} ---")

# Check if the local adapter path exists before trying to upload
if not os.path.isdir(ADAPTER_PATH):
    print(f"❌ ERROR: The directory '{ADAPTER_PATH}' was not found.")
    print("Please make sure your training completed and saved the adapters to that folder.")
else:
    try:
        # Step 1: Login to Hugging Face
        print("Logging in...")
        login(token=HF_TOKEN)
        print("✓ Login successful.")

        # Step 2: Upload the folder
        print(f"Uploading files from '{ADAPTER_PATH}'...")
        api = HfApi()
        # Create the repo if it doesn't exist (same as the merged-model upload cell);
        # upload_folder fails when the target repository is missing.
        api.create_repo(repo_id=HF_REPO_NAME, repo_type="model", exist_ok=True)
        api.upload_folder(
            folder_path=ADAPTER_PATH,
            repo_id=HF_REPO_NAME,
            repo_type="model",
            commit_message="Upload fine-tuned LoRA adapters"
        )
        print(f"\n✅ SUCCESS! Your adapters have been uploaded to:")
        print(f"https://huggingface.co/{HF_REPO_NAME}")

    except Exception as e:
        # Best-effort cell: report the failure without aborting the notebook.
        print(f"\n❌ An error occurred during the upload process: {e}")

In [None]:
# 7 Simple cell to merge LoRA adapters and upload the full model to Hugging Face

import torch
import os
from unsloth import FastLanguageModel
from huggingface_hub import HfApi, login, create_repo

# --- 1. CONFIGURATION ---

# The local folder where your trained adapters are saved.
ADAPTER_PATH = "banking_assistant_final_adapters"

# The local folder where the new merged model will be saved.
MERGED_MODEL_PATH = "banking_assistant_merged_final"

# The name of your repository on the Hugging Face Hub for the MERGED model.
HF_REPO_NAME = "LaythAbuJafar/Agent1_Merged"

# Your Hugging Face token with 'write' permissions.
# Read from the HF_TOKEN environment variable so the secret is never stored in the
# notebook. When it is unset, login() falls back to its interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- 2. MERGE AND UPLOAD SCRIPT ---

print("--- Starting Model Merge and Upload Process ---")

# Check if the adapter path exists
if not os.path.isdir(ADAPTER_PATH):
    print(f"❌ ERROR: The adapter directory '{ADAPTER_PATH}' was not found.")
    print("Please make sure your training completed and saved the adapters to that folder.")
else:
    try:
        # Step 1: Load the base model and apply the adapters
        # NOTE: this rebinds the notebook-level `model` and `tokenizer` names.
        print("\nStep 1: Loading base model and applying LoRA adapters...")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=ADAPTER_PATH,  # Load the adapters directly
            max_seq_length=1024,
            dtype=torch.float16,
            load_in_4bit=True,
        )
        print("✓ Model and adapters loaded successfully.")

        # Step 2: Merge the adapters into the model
        print(f"\nStep 2: Merging adapters into the model and saving to '{MERGED_MODEL_PATH}'...")
        # This saves the full model in 16-bit precision (float16)
        model.save_pretrained_merged(MERGED_MODEL_PATH, tokenizer, save_method="merged_16bit")
        print("✓ Model merged and saved locally.")

        # Step 3: Login to Hugging Face
        print("\nStep 3: Logging in to Hugging Face...")
        login(token=HF_TOKEN)
        print("✓ Login successful.")

        # Step 4: Create the repository and upload the merged model
        print(f"\nStep 4: Uploading the merged model to '{HF_REPO_NAME}'...")
        print("⚠️ This will upload a large model (~15 GB). This may take a while.")

        api = HfApi()
        # Create the repo if it doesn't exist
        create_repo(HF_REPO_NAME, repo_type="model", exist_ok=True)

        # Upload the entire merged model folder
        api.upload_folder(
            folder_path=MERGED_MODEL_PATH,
            repo_id=HF_REPO_NAME,
            repo_type="model",
            commit_message="Upload merged 16-bit model"
        )

        print(f"\n✅ SUCCESS! Your merged model has been uploaded to:")
        print(f"https://huggingface.co/{HF_REPO_NAME}")

    except Exception as e:
        # Best-effort cell: report the failure without aborting the notebook.
        print(f"\n❌ An error occurred during the process: {e}")

In [2]:
# 8. Quantize the merged GGUF model with llama.cpp
import subprocess
import os

# === CONFIGURATION ===
model_path = r"Agent1_Merged.gguf"
quant_type = "q4_K_M"  # Options: q4_0, q5_1, q8_0, etc.
llama_cpp_bin_dir = r"llama.cpp/build/bin"
# Output sits next to the input: "<name>.<quant_type>.gguf".
output_path = os.path.splitext(model_path)[0] + f".{quant_type}.gguf"

# === MAIN LOGIC ===
quantize_exe = os.path.join(llama_cpp_bin_dir, "llama-quantize")
# Windows builds of llama.cpp name the binary with an .exe suffix.
if os.name == "nt" and not os.path.exists(quantize_exe):
    quantize_exe += ".exe"

# Fail early with a clear message if either the tool or the input model is missing.
if not os.path.exists(quantize_exe):
    raise FileNotFoundError(f"llama-quantize not found at: {quantize_exe}")
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at: {model_path}")

# Arguments are passed as a list (no shell), so paths with spaces are safe.
cmd = [quantize_exe, model_path, output_path, quant_type]
print(f"Running: {' '.join(cmd)}")

result = subprocess.run(cmd)

# Raise on failure so a later upload cell never runs against a missing or partial file.
if result.returncode != 0:
    raise RuntimeError(f"❌ Quantization failed with code {result.returncode}")

print(f"✅ Success! Quantized model saved to:\n{output_path}")

Running: llama.cpp/build/bin/llama-quantize Agent1_Merged.gguf Agent1_Merged.q4_K_M.gguf q4_K_M


main: build = 5893 (0f4c6ec0)
main: built with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
main: quantizing 'Agent1_Merged.gguf' to 'Agent1_Merged.q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 23 key-value pairs and 339 tensors from Agent1_Merged.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Agent1_Merged
llama_model_loader: - kv   3:                         general.size_label str              = 7.6B
llama_model_loader: - kv   4:                          qwen2.block_count u32              = 28
llama_model_loader: - kv   5:                       qwen2.context_length u32              = 


main: quantize time = 162722.84 ms
main:    total time = 162722.84 ms
✅ Success! Quantized model saved to:
Agent1_Merged.q4_K_M.gguf


In [4]:
# 9. Upload the quantized model to the Hugging Face Hub
from huggingface_hub import HfApi

import os  # local import: this cell otherwise only imports huggingface_hub

api = HfApi()

local_path = r"Agent1_Merged.q4_K_M.gguf"
repo_id = "LaythAbuJafar/Agent1_GGUF_Q"
target_path_in_repo = "Agent1_Merged.q4_K_M.gguf"

# Fail early with a clear message instead of a late error from the upload call.
if not os.path.isfile(local_path):
    raise FileNotFoundError(f"Quantized model not found at: {local_path}")

# Create the repo if it doesn't exist (same as the merged-model upload cell).
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

api.upload_file(
    path_or_fileobj=local_path,
    path_in_repo=target_path_in_repo,
    repo_id=repo_id,
    repo_type="model"
)

print("✅ Upload complete!")

Agent1_Merged.q4_K_M.gguf:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

✅ Upload complete!
