In [None]:
# 1. setup and installation 
!pip install unsloth_zoo==2025.6.8
!pip install git+https://github.com/unslothai/unsloth.git
!pip install bitsandbytes>=0.41.0
!pip install accelerate>=0.20.0
!pip install peft>=0.4.0
!pip install transformers>=4.32.0
!pip install trl>=0.4.7
!pip install protobuf==5.29.1 fsspec==2025.3.2 --upgrade --force-reinstall


In [None]:
# 2. Load Model and Tokenizer
from unsloth import FastLanguageModel
import torch

# Loading options for the base model
load_in_4bit = True      # load the weights with 4-bit quantization
dtype = None             # None = auto-detect
max_seq_length = 2048

# Fetch the pre-quantized Qwen2.5 instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)


In [None]:
# Fixed Cell 3: Data Preparation with System Prompt Integration

# Step 1: Set CUDA_LAUNCH_BLOCKING=1 (intended to make CUDA errors easier to localise).
# NOTE(review): the intent is to set this before any CUDA-using import, but an earlier
# cell of this notebook already imports torch and loads a model onto the GPU — confirm
# the variable still takes effect when cells run in order (a kernel restart with this
# line executed first may be required). Presumably it also slows training; verify
# whether it should stay enabled outside of debugging.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Step 2: Import required libraries
import torch
import gc
import json
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Step 3: Report whether a CUDA device is visible and how much memory is already allocated
print("GPU Status Check:")
if not torch.cuda.is_available():
    print("✗ No GPU available!")
else:
    initial_allocated_gib = torch.cuda.memory_allocated(0) / 1024**3
    print("✓ CUDA available")
    print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
    print(f"✓ Initial memory: {initial_allocated_gib:.2f} GB")
    
# Step 4: Import Unsloth
from unsloth import FastLanguageModel

# Step 5: Load model with conservative settings
print("\nLoading model with conservative settings...")
max_seq_length = 1024  # Reduced for safety
dtype = torch.float16  # Explicit dtype
load_in_4bit = True

# Release any model left over from an earlier cell before loading another copy.
# Rebinding `model` only drops the previous object once the new one has finished
# loading, so without this step both sets of weights sit on the GPU at the same time.
for _stale_name in ("model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()  # collect first so freed tensors go back to the caching allocator
if torch.cuda.is_available():
    torch.cuda.empty_cache()

try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen2.5-7B-Instruct",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        device_map="sequential",  # More conservative than "auto"
        trust_remote_code=True,
    )
    print("✓ Model loaded successfully!")
except Exception as e:
    print(f"✗ Error loading model: {e}")
    print("\nTrying alternative: base Qwen model...")
    # Drop whatever the failed attempt managed to allocate before retrying.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # No try/except here on purpose: if the fallback fails too, the cell should stop.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="Qwen/Qwen2.5-7B-Instruct",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

# Define your SYSTEM PROMPT
# This text is stored as the "instruction" of every record built later in this cell.
# It describes the ticket JSON schema and the fixed {"error": ...} reply expected for
# requests outside the banking domain.
# NOTE(review): the training cell defines its own copy of this constant — keep the
# two texts in sync.
SYSTEM_PROMPT = """You are an automated banking customer service ticket analysis system. Your purpose is to parse a customer's request and structure it into a standardized JSON format for internal ticketing.

You must perform the following actions:
1. Carefully analyze the user's input to understand their intent and key details.
2. Populate all fields in the JSON object based only on the user's text. Do not invent information.
3. Adhere strictly to the defined categories for ticket_type, severity, and other categorical fields.
4. If the user's request is NOT related to banking or financial services (e.g., tech support for a personal computer, dating advice), you MUST reject it by responding with {"error": "Request is outside the banking support domain."}.
5. Your entire response must be ONLY the JSON object, with no conversational text, apologies, or explanations.

The required JSON format is:
{
    "ticket_type": "complaint" | "inquiry" | "assistance",
    "title": "A brief, descriptive summary of the user's issue.",
    "description": "A more detailed description based on the user's full input.",
    "severity": "low" | "medium" | "high" | "critical",
    "department_impacted": "The most relevant bank department.",
    "service_impacted": "The specific banking service affected.",
    "supporting_documents": "Attached documents and files by the customer",
    "preferred_communication": "preferred method to contact the customer"
}"""

# Step 6: Define Standard Alpaca prompt with proper formatting
# Template with three named placeholders — {instruction}, {input} and {output} —
# laid out under "### Instruction:", "### Input:" and "### Response:" headers.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# Step 7: Load dataset with error handling and create larger sample if needed
print("\nLoading dataset...")
file_path = 'banking_complaints_dataset1k.json'

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"✓ Loaded {len(data)} records from {file_path}")
except (OSError, ValueError) as load_error:
    # OSError covers a missing or unreadable file; ValueError covers malformed JSON
    # (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError). Anything
    # else — KeyboardInterrupt, programming errors — is left to propagate instead of
    # being misreported as "file not found".
    print(f"✗ Could not load {file_path} ({load_error!r}). Creating expanded sample dataset...")
    # Create more diverse sample data for better training in standard Alpaca format
    sample_data = [
        {
            "instruction": SYSTEM_PROMPT,  # Use the system prompt
            "input": "My credit card was charged twice for the same purchase at Target yesterday.",
            "output": json.dumps({
                "ticket_type": "complaint",
                "title": "Duplicate credit card charge",
                "description": "Customer was charged twice for the same purchase at Target",
                "severity": "high",
                "department_impacted": "Credit Card Services",
                "service_impacted": "Credit Card",
                "supporting_documents": "Receipt, credit card statement",
                "preferred_communication": "Phone"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "I need help understanding the fees on my mortgage statement.",
            "output": json.dumps({
                "ticket_type": "inquiry",
                "title": "Mortgage statement clarification",
                "description": "Customer needs explanation of mortgage statement fees",
                "severity": "low",
                "department_impacted": "Loans",
                "service_impacted": "Mortgage",
                "supporting_documents": "Mortgage statement",
                "preferred_communication": "Email"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "Someone used my debit card without permission and made several purchases.",
            "output": json.dumps({
                "ticket_type": "complaint",
                "title": "Unauthorized debit card transactions",
                "description": "Customer reports fraudulent debit card activity",
                "severity": "critical",
                "department_impacted": "Fraud Prevention",
                "service_impacted": "Debit Card",
                "supporting_documents": "Bank statements, police report",
                "preferred_communication": "Phone"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "I want to know about your business loan rates and application process.",
            "output": json.dumps({
                "ticket_type": "inquiry",
                "title": "Business loan information request",
                "description": "Customer inquiring about business loan rates and application process",
                "severity": "low",
                "department_impacted": "Loans",
                "service_impacted": "Business Loans",
                "supporting_documents": "None",
                "preferred_communication": "Email"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "My mobile app keeps crashing when I try to transfer money.",
            "output": json.dumps({
                "ticket_type": "assistance",
                "title": "Mobile app technical issue",
                "description": "Customer experiencing app crashes during money transfers",
                "severity": "medium",
                "department_impacted": "IT Support",
                "service_impacted": "Mobile App",
                "supporting_documents": "Screenshot of error",
                "preferred_communication": "Chat"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "I was charged an overdraft fee but I had money in my savings account.",
            "output": json.dumps({
                "ticket_type": "complaint",
                "title": "Inappropriate overdraft fee",
                "description": "Customer charged overdraft fee despite having funds in savings",
                "severity": "medium",
                "department_impacted": "Customer Service",
                "service_impacted": "Overdraft Protection",
                "supporting_documents": "Account statements",
                "preferred_communication": "Phone"
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "Can you fix my computer? It won't turn on.",
            "output": json.dumps({
                "error": "Request is outside the banking support domain."
            })
        },
        {
            "instruction": SYSTEM_PROMPT,
            "input": "I need dating advice.",
            "output": json.dumps({
                "error": "Request is outside the banking support domain."
            })
        }
    ]

    # Replicate to create more training data.
    # NOTE: the replicas are exact duplicates, so after the later random split the
    # same examples appear in train, eval and test — fine for a smoke test only.
    data = sample_data * 50  # 400 samples total
    print(f"✓ Created {len(data)} sample records for testing")

# Step 8: Normalise every record to the standard Alpaca triple
# (instruction, input, output-as-JSON-string).
formatted_dataset = []
skipped_records = 0
for item in data:
    # A record is usable only when it has both an input and an output; whether it
    # also carries its own instruction does not matter (see below).
    if "input" not in item or "output" not in item:
        skipped_records += 1
        continue

    # Outputs may arrive as parsed dicts or as already-serialised strings.
    raw_output = item["output"]
    output_str = json.dumps(raw_output) if isinstance(raw_output, dict) else raw_output

    formatted_dataset.append({
        # Every record is paired with the shared system prompt, regardless of any
        # instruction text present in the source record.
        "instruction": SYSTEM_PROMPT,
        "input": item["input"],
        "output": output_str
    })

print(f"✓ Formatted {len(formatted_dataset)} samples in standard Alpaca format")
if skipped_records:
    # Previously such records vanished without a trace.
    print(f"⚠️ Skipped {skipped_records} records missing 'input' or 'output'")

# Step 9: Split dataset and save to files
# 30% is held out first and then divided evenly into evaluation and test sets.
train_data, temp_data = train_test_split(formatted_dataset, random_state=42, test_size=0.3)
eval_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.5)

print("\nDataset splits:")
print(f"✓ Train: {len(train_data)} samples")
print(f"✓ Eval: {len(eval_data)} samples")
print(f"✓ Test: {len(test_data)} samples")

# Persist each split to its own JSON file
print("\nSaving dataset splits to files...")
for split_filename, split_records in (
    ('train_data.json', train_data),
    ('eval_data.json', eval_data),
    ('test_data.json', test_data),
):
    with open(split_filename, 'w') as split_file:
        json.dump(split_records, split_file, indent=2)

print(f"✓ Saved train_data.json ({len(train_data)} samples)")
print(f"✓ Saved eval_data.json ({len(eval_data)} samples)")
print(f"✓ Saved test_data.json ({len(test_data)} samples)")

# Step 10: Wrap each split in a Hugging Face Dataset
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_list(split_records)
    for split_records in (train_data, eval_data, test_data)
)

# Step 11: Format prompts correctly for standard Alpaca format
# Use the tokenizer's end-of-sequence token, or "</s>" when it defines none.
EOS_TOKEN = tokenizer.eos_token or "</s>"

def formatting_prompts_func(examples, prompt_template=None, eos_token=None):
    """Render a batch of Alpaca records into full training texts.

    The text is built from the Alpaca template so that it carries the
    "### Instruction:" / "### Input:" / "### Response:" markers. Previously the
    pieces were concatenated by hand without a response marker, so the samples
    saved for inspection did not match the template-based layout.

    Args:
        examples: Batch mapping with parallel lists under the keys
            "instruction", "input" and "output" (the shape passed by
            ``Dataset.map(..., batched=True)``).
        prompt_template: Format string with ``{instruction}``, ``{input}`` and
            ``{output}`` placeholders. Defaults to the module-level
            ``alpaca_prompt``.
        eos_token: String appended to every rendered text. Defaults to the
            module-level ``EOS_TOKEN``.

    Returns:
        ``{"text": [...]}`` with one rendered string per input record.
    """
    # Resolve defaults at call time so a single-argument call (as made by
    # Dataset.map) picks up the notebook-level template and EOS token.
    if prompt_template is None:
        prompt_template = alpaca_prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = [
        # Values are passed as format arguments, so braces inside the JSON output
        # or the system prompt are inserted literally.
        prompt_template.format(instruction=instruction, input=user_input, output=output) + eos_token
        for instruction, user_input, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": texts}

# Render the "text" column for the training and evaluation datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

# Step 12: Save formatted datasets for inspection
print("\nSaving formatted datasets...")

# Pair the raw fields with the rendered text for the first few records of each
# split (10 for training, 5 for evaluation) so the formatting can be reviewed.
formatted_train_samples = [
    {
        "raw_instruction": train_data[idx]["instruction"],
        "raw_input": train_data[idx]["input"],
        "raw_output": train_data[idx]["output"],
        "formatted_text": train_dataset[idx]["text"],
    }
    for idx in range(min(10, len(train_dataset)))
]

formatted_eval_samples = [
    {
        "raw_instruction": eval_data[idx]["instruction"],
        "raw_input": eval_data[idx]["input"],
        "raw_output": eval_data[idx]["output"],
        "formatted_text": eval_dataset[idx]["text"],
    }
    for idx in range(min(5, len(eval_dataset)))
]

for samples_filename, samples in (
    ('formatted_train_samples.json', formatted_train_samples),
    ('formatted_eval_samples.json', formatted_eval_samples),
):
    with open(samples_filename, 'w') as samples_file:
        json.dump(samples, samples_file, indent=2)

print("✓ Saved formatted_train_samples.json (first 10 training examples)")
print("✓ Saved formatted_eval_samples.json (first 5 eval examples)")
print("✓ You can now inspect these files to see exactly what the model will be trained on!")

print("\n✓ Dataset preparation complete!")
print("\nSample formatted text (first 500 chars):")
first_training_text = train_dataset[0]["text"]
print(first_training_text[:500] + "...")

# Sanity check: the output of the first training record must parse as JSON
print("\nVerifying data quality...")
try:
    first_output_text = train_data[0]["output"]
    sample_output = json.loads(first_output_text)
    print("✓ JSON structure is valid")
    sample_keys = list(sample_output.keys())
    print(f"✓ Sample output keys: {sample_keys}")
except Exception as e:
    print(f"✗ JSON structure issue: {e}")

# Final memory check: report allocated and reserved memory on device 0
if torch.cuda.is_available():
    bytes_per_gib = 1024**3
    allocated_bytes = torch.cuda.memory_allocated(0)
    reserved_bytes = torch.cuda.memory_reserved(0)
    print("\nGPU Memory Usage:")
    print(f"- Allocated: {allocated_bytes / bytes_per_gib:.2f} GB")
    print(f"- Reserved: {reserved_bytes / bytes_per_gib:.2f} GB")

In [None]:
# Fixed Cell 4: Enhanced Training from Scratch with System Prompt
import torch
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
from datasets import Dataset
import json
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

# Clear GPU memory
# Drop objects left over from earlier cells first: while those names are still
# bound, neither the garbage collector nor empty_cache() can release their GPU
# memory, and this cell is about to load a fresh copy of the base model.
for _stale_name in ("trainer", "model", "tokenizer"):
    globals().pop(_stale_name, None)
# Collect before emptying the cache so that tensors freed by the collector are
# returned to PyTorch's caching allocator and can then be handed back.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("=== ENHANCED TRAINING WITH SYSTEM PROMPT ===\n")

# Define your SYSTEM PROMPT (same as in cell 3)
# NOTE(review): this is a second copy of the prompt text; the records loaded below
# already carry their own "instruction" field, and this constant is not referenced
# in the visible remainder of this cell — confirm whether it is still needed and,
# if so, keep the two copies identical.
SYSTEM_PROMPT = """You are an automated banking customer service ticket analysis system. Your purpose is to parse a customer's request and structure it into a standardized JSON format for internal ticketing.

You must perform the following actions:
1. Carefully analyze the user's input to understand their intent and key details.
2. Populate all fields in the JSON object based only on the user's text. Do not invent information.
3. Adhere strictly to the defined categories for ticket_type, severity, and other categorical fields.
4. If the user's request is NOT related to banking or financial services (e.g., tech support for a personal computer, dating advice), you MUST reject it by responding with {"error": "Request is outside the banking support domain."}.
5. Your entire response must be ONLY the JSON object, with no conversational text, apologies, or explanations.

The required JSON format is:
{
    "ticket_type": "complaint" | "inquiry" | "assistance",
    "title": "A brief, descriptive summary of the user's issue.",
    "description": "A more detailed description based on the user's full input.",
    "severity": "low" | "medium" | "high" | "critical",
    "department_impacted": "The most relevant bank department.",
    "service_impacted": "The specific banking service affected.",
    "supporting_documents": "Attached documents and files by the customer",
    "preferred_communication": "preferred method to contact the customer"
}"""

# Start from the untouched base checkpoint rather than a previously tuned model
print("Loading base model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    load_in_4bit = True,
    dtype = torch.float16,
    max_seq_length = 1024,
    device_map = "sequential",
    trust_remote_code = True,
)

# Attach a rank-32 LoRA adapter to the attention and MLP projection layers
print("\nAdding enhanced LoRA adapter (rank 32)...")
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
    "gate_proj", "up_proj", "down_proj",      # MLP projections
]
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    lora_alpha = 32,
    lora_dropout = 0.1,
    target_modules = lora_target_modules,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    use_rslora = False,
    loftq_config = None,
    random_state = 42,
)

# Read back the three dataset splits saved by the data-preparation cell
print("\nLoading datasets...")

def _read_split(path):
    """Return the parsed JSON content of one saved dataset split."""
    with open(path, 'r') as split_file:
        return json.load(split_file)

train_data = _read_split('train_data.json')
eval_data = _read_split('eval_data.json')
test_data = _read_split('test_data.json')

print(f"✓ Loaded datasets - Train: {len(train_data)}, Eval: {len(eval_data)}, Test: {len(test_data)}")

# Prompt template used to render every record into a single training string
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# End-of-sequence marker appended to each rendered example ("</s>" if the tokenizer has none)
EOS_TOKEN = tokenizer.eos_token or "</s>"

# Create formatted datasets
print("\nFormatting datasets...")

def _render_example(record):
    """Fill the Alpaca template with one record and terminate it with EOS_TOKEN."""
    rendered = alpaca_prompt.format(
        instruction=record["instruction"],
        input=record["input"],
        output=record["output"],
    )
    return rendered + EOS_TOKEN

train_texts = [_render_example(record) for record in train_data]
eval_texts = [_render_example(record) for record in eval_data]
test_texts = [_render_example(record) for record in test_data]

# Each dataset exposes a single "text" column holding the rendered examples
train_dataset = Dataset.from_dict({"text": train_texts})
eval_dataset = Dataset.from_dict({"text": eval_texts})
test_dataset = Dataset.from_dict({"text": test_texts})

# Enhanced callback for comprehensive metrics tracking
class EnhancedLossCallback(TrainerCallback):
    """Collect loss and learning-rate values from trainer logs and plot them.

    Each metric keeps its own list of step numbers, because a single log entry
    may contain any subset of "loss", "eval_loss" and "learning_rate".
    """

    def __init__(self):
        self.train_losses = []
        self.eval_losses = []
        self.learning_rates = []
        self.steps = []        # global steps at which a training loss was logged
        self.eval_steps = []   # global steps at which an evaluation loss was logged
        self.lr_steps = []     # global steps at which a learning rate was logged

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record whichever tracked metrics are present in this log entry."""
        if not logs:
            return
        step = state.global_step
        if "loss" in logs:
            self.train_losses.append(logs["loss"])
            self.steps.append(step)
        if "eval_loss" in logs:
            self.eval_losses.append(logs["eval_loss"])
            self.eval_steps.append(step)
        if "learning_rate" in logs:
            self.learning_rates.append(logs["learning_rate"])
            # Track the step alongside the value: borrowing the x-axis from
            # `self.steps` misaligns (or fails to plot) whenever the two lists
            # differ in length.
            self.lr_steps.append(step)

    def plot_metrics(self, save_path="enhanced_training_metrics.png"):
        """Plot the loss curves and learning-rate schedule, save the figure, print a summary.

        Args:
            save_path: File the figure is written to before it is shown.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        # Plot losses
        if self.train_losses:
            ax1.plot(self.steps, self.train_losses, 'b-', label='Training Loss', alpha=0.7)
        if self.eval_losses:
            ax1.plot(self.eval_steps, self.eval_losses, 'r-', label='Validation Loss', linewidth=2)
        ax1.set_xlabel('Steps')
        ax1.set_ylabel('Loss')
        ax1.set_title('Training and Validation Loss - Enhanced Configuration')
        # Only draw a legend when something was plotted (e.g. not after an
        # interruption before the first log entry).
        if ax1.get_legend_handles_labels()[0]:
            ax1.legend()
        ax1.grid(True, alpha=0.3)
        ax1.set_ylim(bottom=0)

        # Plot learning rate
        if self.learning_rates:
            ax2.plot(self.lr_steps, self.learning_rates, 'g-', label='Learning Rate')
            ax2.set_xlabel('Steps')
            ax2.set_ylabel('Learning Rate')
            ax2.set_title('Learning Rate Schedule')
            ax2.legend()
            ax2.grid(True, alpha=0.3)
            ax2.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))

        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

        # Print statistics (only when both curves have at least one point)
        if self.train_losses and self.eval_losses:
            print(f"\nTraining Statistics:")
            print(f"  - Initial training loss: {self.train_losses[0]:.4f}")
            print(f"  - Final training loss: {self.train_losses[-1]:.4f}")
            print(f"  - Best training loss: {min(self.train_losses):.4f}")
            print(f"  - Initial validation loss: {self.eval_losses[0]:.4f}")
            print(f"  - Final validation loss: {self.eval_losses[-1]:.4f}")
            print(f"  - Best validation loss: {min(self.eval_losses):.4f}")

loss_callback = EnhancedLossCallback()

# Optional Weights & Biases logging
use_wandb = False  # Set to True to use Weights & Biases
report_to = "none"  # stays "none" unless W&B is requested and initialises successfully
if use_wandb:
    try:
        import wandb
        wandb_run_name = f"qwen-7b-rank32-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        wandb_config = {
            "model": "Qwen2.5-7B-Instruct",
            "lora_rank": 32,
            "dataset_size": len(train_data),
            "epochs": 5,
            "batch_size": 2,
            "learning_rate": 1.5e-4,
        }
        wandb.init(
            project="banking-assistant-enhanced",
            name=wandb_run_name,
            config=wandb_config,
        )
        report_to = "wandb"
        print("✓ Weights & Biases initialized")
    except ImportError:
        print("⚠️ W&B not installed. Install with: pip install wandb")

# Training configuration, grouped by concern
training_args = TrainingArguments(
    output_dir="./banking_assistant_enhanced_v2",
    seed=42,
    report_to=report_to,

    # Batching and duration
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,

    # Optimiser and learning-rate schedule
    optim="adamw_torch",
    learning_rate=1.5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,

    # Precision and memory
    fp16=True,
    gradient_checkpointing=True,

    # Logging, evaluation and checkpointing (save_steps is a multiple of eval_steps)
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=30,
    save_strategy="steps",
    save_steps=60,

    # Restore the checkpoint with the lowest evaluation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Data handling and multi-GPU behaviour
    remove_unused_columns=True,
    ddp_find_unused_parameters=False if torch.cuda.device_count() > 1 else None,
)

# Summarise the configuration, then build the trainer
effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
print("\nInitializing enhanced trainer...")
print("Configuration:")
print("  - LoRA rank: 32")
print(f"  - Learning rate: {training_args.learning_rate}")
print(f"  - Epochs: {training_args.num_train_epochs}")
print(f"  - Warmup steps: {training_args.warmup_steps}")
print(f"  - Weight decay: {training_args.weight_decay}")
print(f"  - Effective batch size: {effective_batch_size}")

# Supervised fine-tuning on the "text" column, one example per sequence (no packing)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=1024,
    packing=False,
    callbacks=[loss_callback],
)

# Start training
print("\nStarting enhanced training from scratch...")
# Rough step estimate: examples // per-device batch size // accumulation steps, times epochs.
# NOTE(review): integer division floors at each stage and the device count is not
# considered, so this may differ from the step count the trainer reports.
total_steps = len(train_dataset) // training_args.per_device_train_batch_size // training_args.gradient_accumulation_steps * training_args.num_train_epochs
print(f"Total training steps: {total_steps}")

try:
    # Train the model
    trainer.train()
    
    # Plot comprehensive metrics
    # The timestamp in the file name keeps plots from separate runs apart.
    print("\nPlotting training metrics...")
    loss_callback.plot_metrics(save_path=f"enhanced_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png")
    
    # Save the model
    # Model and tokenizer are written to the same directory.
    print("\nSaving enhanced model...")
    model.save_pretrained("banking_assistant_enhanced_final")
    tokenizer.save_pretrained("banking_assistant_enhanced_final")
    
    # Save merged version
    print("\nSaving merged model...")
    model.save_pretrained_merged("banking_assistant_enhanced_merged_final", tokenizer, save_method="merged_16bit")
    
    print("\n✓ Training completed successfully!")
    
except KeyboardInterrupt:
    # Manual interruption: keep the current state in a separate checkpoint directory
    # and still plot whatever metrics were collected. Execution then continues with
    # the statements after this try block.
    print("\n\nTraining interrupted by user.")
    print("Saving checkpoint...")
    model.save_pretrained("banking_assistant_enhanced_checkpoint")
    tokenizer.save_pretrained("banking_assistant_enhanced_checkpoint")
    loss_callback.plot_metrics(save_path="enhanced_metrics_interrupted.png")
    
except Exception as e:
    # Report the error, then re-raise so the cell stops here.
    print(f"\n✗ Training error: {e}")
    raise

# Custom evaluation function for test set
print("\n" + "="*50)
print("EVALUATING ON TEST SET")
print("="*50)

def evaluate_on_test_set(model, tokenizer, test_data, batch_size=4,
                         prompt_template=None, eos_token=None, max_length=1024):
    """Compute the token-level average loss and perplexity over the test records.

    Args:
        model: Causal language model; called as ``model(**encodings, labels=labels)``
            and expected to return an object whose ``.loss`` is the mean loss over
            the non-ignored label positions.
        tokenizer: Tokenizer called on a list of strings with ``padding=True``;
            its result must provide "input_ids" and "attention_mask" and a
            ``.to(device)`` method.
        test_data: Sequence of records with "instruction", "input" and "output".
        batch_size: Number of records scored per forward pass.
        prompt_template: Format string with ``{instruction}``, ``{input}`` and
            ``{output}`` placeholders. Defaults to the module-level ``alpaca_prompt``.
        eos_token: String appended to each rendered prompt. Defaults to the
            module-level ``EOS_TOKEN``.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        ``(avg_loss, perplexity)`` where ``avg_loss`` is averaged over scored
        tokens and ``perplexity`` is ``exp(avg_loss)``.

    Raises:
        ValueError: If ``test_data`` is empty or no token could be scored.
    """
    if len(test_data) == 0:
        raise ValueError("test_data is empty; nothing to evaluate")
    if prompt_template is None:
        prompt_template = alpaca_prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    # Follow the model's own device instead of assuming "cuda".
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model.eval()
    total_loss = 0.0
    total_tokens = 0

    print(f"Evaluating {len(test_data)} test samples...")

    with torch.no_grad():
        for batch_number, start in enumerate(range(0, len(test_data), batch_size), start=1):
            stop = min(start + batch_size, len(test_data))
            batch_texts = [
                prompt_template.format(
                    instruction=test_data[j]["instruction"],
                    input=test_data[j]["input"],
                    output=test_data[j]["output"],
                ) + eos_token
                for j in range(start, stop)
            ]

            # Tokenize batch
            encodings = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            # Labels mirror the inputs, except that padding positions are set to
            # -100 so they are ignored by the loss. Scoring the padding would
            # distort both the loss and the perplexity.
            labels = encodings["input_ids"].clone()
            labels[encodings["attention_mask"] == 0] = -100

            # Causal LM loss predicts token t+1 from token t, so the first column
            # is never a target; count the positions that actually contribute.
            scored_tokens = int((labels[:, 1:] != -100).sum().item())

            if scored_tokens > 0:
                outputs = model(**encodings, labels=labels)
                # Weight each batch mean by its token count so the final average
                # (and therefore the perplexity) is per token, not per batch.
                total_loss += outputs.loss.item() * scored_tokens
                total_tokens += scored_tokens

            # Progress indicator
            if batch_number % 10 == 0:
                print(f"  Evaluated {stop}/{len(test_data)} samples...")

    if total_tokens == 0:
        raise ValueError("no tokens were available for scoring")

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return avg_loss, perplexity

# Run evaluation
# Scores the held-out test records with the model currently bound to `model`.
test_loss, test_perplexity = evaluate_on_test_set(model, tokenizer, test_data)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

# Generate sample predictions
print("\n" + "="*50)
print("SAMPLE PREDICTIONS")
print("="*50)

# Enable inference mode
FastLanguageModel.for_inference(model)

# Diverse test cases - including non-banking requests
# Each entry holds the customer text under "input" plus optional expectations:
# "expected_type" / "expected_severity" for banking requests, and
# "expected_output" / "expected_error" for requests that should be rejected.
# NOTE(review): the list is only defined here; the visible part of this cell does
# not iterate over it or call the model — confirm the generation loop was not lost.
test_cases = [
    {
        "input": "My debit card was declined at the ATM but I have sufficient funds.",
        "expected_type": "complaint"
    },
    {
        "input": "I'd like information about opening a business checking account.",
        "expected_type": "inquiry"
    },
    {
        "input": "Someone made unauthorized purchases with my credit card in another country.",
        "expected_type": "complaint",
        "expected_severity": "critical"
    },
    {
        "input": "Can you help me set up automatic bill payments?",
        "expected_type": "assistance"
    },
    {
        "input": "The mobile app crashes every time I try to check my balance.",
        "expected_type": "assistance"
    },
    {
        "input": "Can you help me fix my laptop? It won't start.",
        "expected_output": "error",
        "expected_error": True
    },
    {
        "input": "I need relationship advice.",
        "expected_output": "error",
        "expected_error": True
    }]

In [None]:
# 5. upload adapters 
# Upload LoRA Adapters to Hugging Face
import os
from huggingface_hub import HfApi, create_repo, upload_folder
from huggingface_hub import login
import json

# Configuration
ADAPTER_PATH = "banking_assistant_enhanced_final"
HF_REPO_NAME = "LaythAbuJafar/QwenInstruct_Agent1_Adapters"

# Never paste an access token into the notebook: it would be saved with the file
# and leak through version control or sharing. Read it from the HF_TOKEN
# environment variable, or ask for it interactively without echoing it.
from getpass import getpass
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (write permission): ")
if not HF_TOKEN:
    raise ValueError("No Hugging Face token provided; set HF_TOKEN or enter one when prompted.")

print("=== Uploading LoRA Adapters to Hugging Face ===\n")

# Step 1: Login to Hugging Face
print("Logging in to Hugging Face...")
login(token=HF_TOKEN)
print("✓ Logged in successfully")

# Step 2: Create or verify repository exists
api = HfApi()
try:
    # exist_ok=True makes this call succeed when the repository already exists,
    # so an exception here is a genuine failure (authentication, permissions,
    # network) and not an "already exists" condition.
    create_repo(
        repo_id=HF_REPO_NAME,
        repo_type="model",
        private=False,  # Set to True if you want a private repo
        exist_ok=True
    )
except Exception as e:
    # Stop here: continuing would only fail later, during the upload, with a less
    # direct error message.
    print(f"✗ Could not create or access repository '{HF_REPO_NAME}': {e}")
    raise
else:
    print(f"✓ Repository '{HF_REPO_NAME}' is ready")

# Step 3: Create a comprehensive model card
print("\nCreating model card...")
model_card_content = """---
base_model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
tags:
- banking
- customer-service
- json-generation
- lora
- qwen2.5
- unsloth
license: apache-2.0
language:
- en
pipeline_tag: text-generation
library_name: peft
---

# QwenInstruct Banking Agent - LoRA Adapters

This model is a fine-tuned version of Qwen2.5-7B-Instruct for banking customer service ticket generation.

## Model Details

- **Base Model**: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
- **Fine-tuning Method**: LoRA (Low-Rank Adaptation)
- **LoRA Rank**: 32
- **LoRA Alpha**: 32
- **Target Modules**: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
- **Training Framework**: Unsloth + TRL

## Training Details

- **Dataset Size**: 1,000 banking customer service examples
- **Training Split**: 70% train, 15% validation, 15% test
- **Epochs**: 5
- **Learning Rate**: 1.5e-4
- **Batch Size**: 8 (2 * 4 gradient accumulation)
- **Optimizer**: AdamW
- **Weight Decay**: 0.01

## Performance

- **Final Training Loss**: 0.055
- **Final Validation Loss**: 0.072
- **JSON Generation Success Rate**: 100%
- **Test Perplexity**: 8.32

## Usage

### Loading the Adapters

```python
from unsloth import FastLanguageModel
import torch

# Load base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    max_seq_length=1024,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Load LoRA adapters
model = FastLanguageModel.from_pretrained(
    model_name="LaythAbuJafar/QwenInstruct_Agent1_Adapters",
    max_seq_length=1024,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Enable for inference
FastLanguageModel.for_inference(model)
```

### Generation Example

```python
# Define the prompt template
alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
\"\"\"

# Create prompt
prompt = alpaca_prompt.format(
    instruction="You are a banking customer service assistant. Analyze the customer input and create a complaint/inquiry ticket in valid JSON format. The JSON must include these fields: ticket_type, title, description, severity, department_impacted, service_impacted, supporting_documents, preferred_communication.",
    input="My credit card was charged twice for the same purchase."
)

# Generate
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.6,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```

## Expected Output Format

The model generates JSON tickets with the following structure:
```json
{
    "ticket_type": "complaint|inquiry|assistance",
    "title": "Brief description of the issue",
    "description": "Detailed explanation of the customer's request",
    "severity": "low|medium|high|critical",
    "department_impacted": "Relevant department",
    "service_impacted": "Specific service affected",
    "supporting_documents": "Required documentation",
    "preferred_communication": "phone|email|chat|not specified"
}
```

## Limitations

- Trained specifically for banking domain
- English language only
- Requires the exact base model (unsloth/Qwen2.5-7B-Instruct-bnb-4bit)

## Citation

If you use this model, please cite:
```
@misc{qweninstruct-banking-agent,
  author = {Layth Abu Jafar},
  title = {QwenInstruct Banking Agent},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/LaythAbuJafar/QwenInstruct_Agent1_Adapters}
}
```
"""

# Save model card
# Write README.md explicitly as UTF-8. Without an encoding argument open() uses
# the platform locale (e.g. cp1252 on Windows), which can fail or produce a
# mis-encoded README as soon as the card text contains a non-ASCII character.
with open(os.path.join(ADAPTER_PATH, "README.md"), "w", encoding="utf-8") as f:
    f.write(model_card_content)
print("✓ Model card created")

# Step 4: Create adapter info file
# Training metadata stored next to the adapter weights as training_info.json.
adapter_info = dict(
    base_model="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    model_type="qwen2.5",
    fine_tuning_method="lora",
    lora_rank=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    training_framework="unsloth",
    dataset_size=1000,
    training_epochs=5,
    final_loss=0.055,
    validation_loss=0.072,
)

# Serialize first, then write the finished text in one call.
training_info_path = os.path.join(ADAPTER_PATH, "training_info.json")
with open(training_info_path, "w") as info_file:
    info_file.write(json.dumps(adapter_info, indent=2))
print("✓ Training info saved")

# Step 5: Upload to Hugging Face
# Push the contents of ADAPTER_PATH (weights, README.md, training_info.json)
# to the model repository named by HF_REPO_NAME.
print(f"\nUploading adapters to {HF_REPO_NAME}...")
try:
    api.upload_folder(
        folder_path=ADAPTER_PATH,
        repo_id=HF_REPO_NAME,
        repo_type="model",
        commit_message="Upload fine-tuned LoRA adapters for banking customer service"
    )
except Exception as e:
    # Best-effort: report the problem and keep the notebook running, but do not
    # fall through to the success banner below.
    print(f"✗ Error uploading: {e}")
    print("\nTroubleshooting:")
    print("1. Make sure you have write access to the repository")
    print("2. Check that your HF token has 'write' permissions")
    print("3. Verify the repository name is correct")
    print("\n=== Upload Failed ===")
else:
    # Only announce availability when upload_folder returned without raising.
    print(f"✓ Successfully uploaded to https://huggingface.co/{HF_REPO_NAME}")
    print("\n=== Upload Complete ===")
    print(f"Your adapters are now available at: https://huggingface.co/{HF_REPO_NAME}")
    print("\nTo use these adapters:")
    print(f"model = FastLanguageModel.from_pretrained('{HF_REPO_NAME}', ...)")

In [None]:
# 6. Upload Merged Model to Hugging Face
import os
from huggingface_hub import HfApi, create_repo
from huggingface_hub import login
import json
import shutil

# Configuration
from getpass import getpass

MERGED_MODEL_PATH = "banking_assistant_enhanced_merged_final"
HF_REPO_NAME = "LaythAbuJafar/QwenInstruct_Agent1_Merged"
# Never store an access token in the notebook itself: it would be saved with the
# file and leak as soon as the notebook is shared. Read it from the HF_TOKEN
# environment variable, and prompt (input hidden) only when that is not set.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token (write scope): ")

print("=== Uploading Merged Model to Hugging Face ===\n")

# Step 1: Login to Hugging Face
print("Logging in to Hugging Face...")
login(token=HF_TOKEN)
print("✓ Logged in successfully")

# Step 2: Create or verify repository exists
api = HfApi()
try:
    # exist_ok=True makes an already-existing repository a success, so reaching
    # the except branch always means a real failure (auth, network, naming, ...).
    create_repo(
        repo_id=HF_REPO_NAME,
        repo_type="model",
        private=False,  # Set to True if you want a private repo
        exist_ok=True
    )
    print(f"✓ Repository '{HF_REPO_NAME}' is ready")
except Exception as e:
    # Best-effort: keep going so the later upload step reports its own error.
    print(f"✗ Could not create or access repository '{HF_REPO_NAME}': {e}")

# Step 3: Create a comprehensive model card for the merged model
print("\nCreating model card...")
model_card_content = """---
base_model: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
tags:
- banking
- customer-service  
- json-generation
- merged-model
- qwen2.5
- unsloth
- text-generation
license: apache-2.0
language:
- en
pipeline_tag: text-generation
library_name: transformers
---

# QwenInstruct Banking Agent - Merged Model

This is a merged version of the fine-tuned Qwen2.5-7B-Instruct model for banking customer service ticket generation. The LoRA adapters have been merged into the base model for easier deployment.

## Model Details

- **Base Model**: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
- **Fine-tuning Method**: LoRA (merged into base model)
- **Model Format**: 16-bit merged model
- **Model Size**: ~15 GB
- **Training Framework**: Unsloth + TRL

## Training Details

- **LoRA Configuration**:
  - Rank: 32
  - Alpha: 32
  - Dropout: 0.1
  - Target Modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj

- **Training Configuration**:
  - Dataset Size: 1,000 banking examples
  - Epochs: 5
  - Learning Rate: 1.5e-4
  - Batch Size: 8 (effective)
  - Optimizer: AdamW
  - Weight Decay: 0.01

## Performance Metrics

- **Final Training Loss**: 0.055
- **Final Validation Loss**: 0.072  
- **JSON Generation Success Rate**: 100%
- **Test Perplexity**: 8.32

## Usage

### Simple Loading

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the merged model directly
model = AutoModelForCausalLM.from_pretrained(
    "LaythAbuJafar/QwenInstruct_Agent1_Merged",
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("LaythAbuJafar/QwenInstruct_Agent1_Merged")
```

### Using with Unsloth

```python
from unsloth import FastLanguageModel
import torch

# Load with Unsloth for optimized inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="LaythAbuJafar/QwenInstruct_Agent1_Merged",
    max_seq_length=1024,
    dtype=torch.float16,
    load_in_4bit=True,  # Optional: use 4-bit quantization
)

FastLanguageModel.for_inference(model)
```

### Generation Example

```python
# Define the prompt template
alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
\"\"\"

# Example usage
instruction = "You are a banking customer service assistant. Analyze the customer input and create a complaint/inquiry ticket in valid JSON format. The JSON must include these fields: ticket_type, title, description, severity, department_impacted, service_impacted, supporting_documents, preferred_communication."
user_input = "I received a suspicious email asking for my banking credentials."

prompt = alpaca_prompt.format(
    instruction=instruction,
    input=user_input
)

# Generate response
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.6,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract the JSON that follows the response marker
# (falls back to the full text if the marker is missing)
json_response = response.split("### Response:", 1)[-1].strip()
print(json_response)
```

## Expected Output Format

```json
{
    "ticket_type": "complaint|inquiry|assistance",
    "title": "Brief issue description",
    "description": "Detailed customer request explanation",
    "severity": "low|medium|high|critical",
    "department_impacted": "Relevant department",
    "service_impacted": "Affected service",
    "supporting_documents": "Required documents",
    "preferred_communication": "phone|email|chat|not specified"
}
```

## Example Outputs

**Input**: "My credit card payment didn't go through but the money was deducted."
```json
{
    "ticket_type": "complaint",
    "title": "Failed payment with deduction",
    "description": "Customer reports credit card payment failed but money was deducted from account",
    "severity": "high",
    "department_impacted": "Payment Processing",
    "service_impacted": "Credit Card Payments",
    "supporting_documents": "Transaction history, payment confirmation",
    "preferred_communication": "phone"
}
```

## Deployment Tips

1. **Memory Requirements**: ~15 GB in 16-bit precision (fp16), ~8 GB with 4-bit quantization
2. **Inference Speed**: Use Flash Attention 2 for faster inference
3. **Batch Processing**: Model supports batch inference for multiple tickets
4. **Temperature**: Use 0.5-0.7 for consistent JSON generation

## Limitations

- Specialized for banking domain only
- English language support only
- Requires GPU for optimal performance
- JSON structure is fixed to the trained format

## Citation

```bibtex
@misc{qweninstruct-banking-agent-merged,
  author = {Layth Abu Jafar},
  title = {QwenInstruct Banking Agent - Merged Model},
  year = {2024},
  publisher = {Hugging Face},
  url = {https://huggingface.co/LaythAbuJafar/QwenInstruct_Agent1_Merged}
}
```

## Acknowledgments

- Base model: Qwen Team
- Fine-tuning framework: Unsloth
- Training library: TRL (Transformer Reinforcement Learning)
"""

# Save model card
# Write README.md explicitly as UTF-8. Without an encoding argument open() uses
# the platform locale (e.g. cp1252 on Windows), which can fail or produce a
# mis-encoded README as soon as the card text contains a non-ASCII character.
with open(os.path.join(MERGED_MODEL_PATH, "README.md"), "w", encoding="utf-8") as f:
    f.write(model_card_content)
print("✓ Model card created")

# Step 4: Create model info file
# Summary metadata stored next to the merged weights as model_info.json.
lora_details = dict(
    method="lora",
    rank=32,
    alpha=32,
    modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
reported_metrics = dict(
    final_loss=0.055,
    validation_loss=0.072,
    json_success_rate="100%",
    test_perplexity=8.32,
)
model_info = dict(
    model_type="qwen2.5-merged",
    base_model="unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    merge_method="unsloth_merged_16bit",
    fine_tuning_details=lora_details,
    performance=reported_metrics,
    training_framework="unsloth",
    model_size="~15GB",
    quantization_compatible=True,
)

# Serialize first, then write the finished text in one call.
model_info_path = os.path.join(MERGED_MODEL_PATH, "model_info.json")
with open(model_info_path, "w") as info_file:
    info_file.write(json.dumps(model_info, indent=2))
print("✓ Model info saved")

# Step 5: Check model files
# Preview what is about to be uploaded: the entry count, then name and size
# (in MB) of at most the first ten entries of the merged-model directory.
print("\nChecking model files...")
model_files = os.listdir(MERGED_MODEL_PATH)
print(f"Found {len(model_files)} files to upload:")

shown_entries = model_files[:10]
hidden_count = len(model_files) - 10
for entry_name in shown_entries:
    entry_bytes = os.path.getsize(os.path.join(MERGED_MODEL_PATH, entry_name))
    print(f"  - {entry_name}: {entry_bytes / 2**20:.1f} MB")
if hidden_count > 0:
    print(f"  ... and {hidden_count} more files")

# Step 6: Upload to Hugging Face
# Report the payload size, then push the merged-model directory to HF_REPO_NAME.
print(f"\n⚠️  WARNING: This will upload ~15GB of data!")
print(f"Uploading to {HF_REPO_NAME}...")

try:
    # Sum every file below MERGED_MODEL_PATH. Walking the tree (instead of sizing
    # only the top-level directory entries) keeps the reported total correct if
    # the folder contains sub-directories.
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(MERGED_MODEL_PATH)
        for filename in filenames
    )
    total_size_gb = total_size / (1024**3)
    print(f"Total size to upload: {total_size_gb:.2f} GB")

    # Upload the model
    # NOTE(review): the multi_commits / multi_commits_verbose flags suggested here
    # previously appear to be gone from newer huggingface_hub releases;
    # HfApi.upload_large_folder is the resumable alternative for very large
    # uploads. Confirm against the installed version before switching.
    api.upload_folder(
        folder_path=MERGED_MODEL_PATH,
        repo_id=HF_REPO_NAME,
        repo_type="model",
        commit_message="Upload merged Qwen2.5-7B banking assistant model",
    )

except Exception as e:
    # Best-effort: report the problem and keep the notebook running, but do not
    # fall through to the success banner below.
    print(f"✗ Error uploading: {e}")
    print("\nTroubleshooting for large model uploads:")
    print("1. Make sure you have a stable internet connection")
    print("2. Consider using git-lfs directly for very large files:")
    print("   - git clone https://huggingface.co/YOUR_REPO")
    print("   - git lfs track '*.bin' '*.safetensors'")
    print("   - git add . && git commit -m 'Add model'")
    print("   - git push")
    print("3. You can also use the huggingface-cli:")
    print(f"   huggingface-cli upload {HF_REPO_NAME} {MERGED_MODEL_PATH}")
    print("\n=== Upload Failed ===")
else:
    # Only announce availability when upload_folder returned without raising.
    print(f"✓ Successfully uploaded to https://huggingface.co/{HF_REPO_NAME}")
    print("\n=== Upload Complete ===")
    print(f"Your merged model is now available at: https://huggingface.co/{HF_REPO_NAME}")
    print("\nUsers can now load your model with:")
    print(f"model = AutoModelForCausalLM.from_pretrained('{HF_REPO_NAME}')")

In [3]:
# 7. Quantize the merged GGUF model with llama.cpp's llama-quantize tool
import subprocess
import os

# === CONFIGURATION ===
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them
# before running this cell on another machine.
model_path = r"C:\Users\Laith\Desktop\qwen_instruct\QwenInstruct_Agent1_Merged.gguf"
quant_type = "q4_K_M"  # Options: q4_0, q5_1, q8_0, etc.
llama_cpp_bin_dir = r"C:\Users\Laith\Desktop\qwen_instruct\llama.cpp\build\bin\Release"
# Output sits next to the input: <name>.<quant_type>.gguf
output_path = os.path.splitext(model_path)[0] + f".{quant_type}.gguf"

# === MAIN LOGIC ===
# Fail early with a clear message if the tool or the input model is missing.
quantize_exe = os.path.join(llama_cpp_bin_dir, "llama-quantize.exe")
if not os.path.exists(quantize_exe):
    raise FileNotFoundError(f"llama-quantize.exe not found at: {quantize_exe}")
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at: {model_path}")

# Argument list (no shell involved): <exe> <input.gguf> <output.gguf> <type>
cmd = [quantize_exe, model_path, output_path, quant_type]
print(f"Running: {' '.join(cmd)}")

result = subprocess.run(cmd)

if result.returncode == 0:
    print(f"✅ Success! Quantized model saved to:\n{output_path}")
else:
    print(f"❌ Quantization failed with code {result.returncode}")
    # Stop the notebook here: continuing would let the following upload cell
    # push a missing, partial, or stale file from an earlier run.
    raise RuntimeError(
        f"llama-quantize exited with code {result.returncode}; "
        f"do not use or upload {output_path}"
    )


Running: C:\Users\Laith\Desktop\qwen_instruct\llama.cpp\build\bin\Release\llama-quantize.exe C:\Users\Laith\Desktop\qwen_instruct\QwenInstruct_Agent1_Merged.gguf C:\Users\Laith\Desktop\qwen_instruct\QwenInstruct_Agent1_Merged.q4_K_M.gguf q4_K_M
✅ Success! Quantized model saved to:
C:\Users\Laith\Desktop\qwen_instruct\QwenInstruct_Agent1_Merged.q4_K_M.gguf


In [5]:
# Upload the quantized GGUF model to Hugging Face
from huggingface_hub import HfApi

api = HfApi()

# NOTE(review): absolute, machine-specific path; it must match the output_path
# produced by the quantization cell.
local_path = r"C:\Users\Laith\Desktop\qwen_instruct\QwenInstruct_Agent1_Merged.q4_K_M.gguf"
# NOTE(review): "Insturct" looks like a typo, but the recorded output shows an
# upload completing against this repo id, so changing it would target a
# different repository.
repo_id = "LaythAbuJafar/Qwen_Insturct7B_Agent1_GGUF_Q"
target_path_in_repo = "QwenInstruct_Agent1_Merged.q4_K_M.gguf"

# Make sure the target repository exists before uploading, as the other upload
# cells in this notebook do. exist_ok=True leaves an existing repo untouched.
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

# upload_file raises on failure, so the message below only prints on success.
api.upload_file(
    path_or_fileobj=local_path,
    path_in_repo=target_path_in_repo,
    repo_id=repo_id,
    repo_type="model"
)

print("✅ Upload complete!")


QwenInstruct_Agent1_Merged.q4_K_M.gguf:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

✅ Upload complete!
