In [5]:
import json
import pandas as pd
import sys
import os

# Add src directory to path to import our modules
sys.path.append('src')

# Import our QLoRA model loading function
from load_qlora_model import load_qlora_model, ask_financial_risk_qlora

# Load the evaluation examples.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system); JSON files
# are normally UTF-8.
with open('evaluation_examples.json', 'r', encoding='utf-8') as f:
    evaluation_data = json.load(f)

# Each record is accessed below through its 'answer' and 'question' keys.
print(f"‚úÖ Loaded {len(evaluation_data)} evaluation examples")
print("üìä Distribution:")
for item in evaluation_data:
    # One line per example: expected label plus the first 50 characters of the input.
    print(f"   ‚Ä¢ {item['answer']}: {item['question'][:50]}...")


‚úÖ Loaded 10 evaluation examples
üìä Distribution:
   ‚Ä¢ Good: Age: 19.0, Occupation: Musician, Annual Income: 18...
   ‚Ä¢ Good: Age: 20.0, Occupation: Doctor, Annual Income: 3819...
   ‚Ä¢ Good: Age: 37.0, Occupation: Doctor, Annual Income: 1461...
   ‚Ä¢ Good: Age: 23.0, Occupation: Musician, Annual Income: 19...
   ‚Ä¢ Standard: Age: 17.0, Occupation: Manager, Annual Income: 442...
   ‚Ä¢ Standard: Age: 31.0, Occupation: Teacher, Annual Income: 788...
   ‚Ä¢ Standard: Age: 39.0, Occupation: Scientist, Annual Income: 3...
   ‚Ä¢ Bad: Age: 22.0, Occupation: Media_Manager, Annual Incom...
   ‚Ä¢ Bad: Age: 38.0, Occupation: Mechanic, Annual Income: 44...
   ‚Ä¢ Bad: Age: 37.0, Occupation: Mechanic, Annual Income: 18...


In [6]:
# Bring up the QLoRA model. On failure the error is reported and `llm` is left
# as None, which the following cells check before running predictions.
print("üîÑ Loading QLoRA model...")
try:
    llm = load_qlora_model()
    print("‚úÖ QLoRA model loaded successfully!")
except Exception as load_error:
    # Best-effort: keep the notebook running and leave a sentinel behind.
    llm = None
    print(f"‚ùå Error loading model: {load_error}")


üîÑ Loading QLoRA model...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


‚úÖ QLoRA model loaded successfully!


In [7]:
# Run the QLoRA model over every evaluation example.
# Produces `results`: exactly one dict per sample with the keys 'sample',
# 'expected', 'input' and 'prediction'. A failed sample is kept in the list with
# its prediction set to "ERROR: <message>" so the sample numbering stays aligned.
if llm is not None:
    # Take the count from the data instead of hard-coding it, so the banner
    # stays correct when the evaluation file changes.
    total_samples = len(evaluation_data)
    print(f"üöÄ Running predictions on all {total_samples} evaluation samples...")
    print("=" * 100)

    results = []

    for i, example in enumerate(evaluation_data, 1):
        print(f"\nüìù SAMPLE {i}/{total_samples}")
        print(f"Expected: {example['answer']}")
        print(f"Input: {example['question']}")
        print("-" * 80)

        try:
            # Get prediction from QLoRA model
            prediction = ask_financial_risk_qlora(example['question'], llm)

            print("ü§ñ Model Prediction:")
            print(prediction)

        except Exception as e:
            # Keep going with the remaining samples; record the failure text
            # in place of the model output.
            print(f"‚ùå Error getting prediction: {e}")
            prediction = f"ERROR: {e}"

        # Single append point: every sample is recorded once, whether the
        # call succeeded or failed.
        results.append({
            'sample': i,
            'expected': example['answer'],
            'input': example['question'],
            'prediction': prediction
        })

        print("=" * 100)

    print(f"\n‚úÖ Completed predictions for all {total_samples} samples!")

else:
    print("‚ùå Cannot run predictions - model not loaded")


üöÄ Running predictions on all 10 evaluation samples...

üìù SAMPLE 1/10
Expected: Good
Input: Age: 19.0, Occupation: Musician, Annual Income: 18246.25_, Outstanding Debt: 845.61, Credit Utilization Ratio: 40.44063497957373, Payment Behaviour: Low_spent_Small_value_payments
--------------------------------------------------------------------------------
<reasoning>
The individual is a 19-year-old musician with an annual income of $18,246. This suggests they are likely in their early stages of financial life and may be more susceptible to impulsive spending or debt accumulation due to lack of experience managing finances independently. Their credit utilization ratio at 40% is considered standard for someone who has a mix of revolving (credit cards) and possibly installment debts, which indicates that while not perfect, they are using their available credit within reasonable limits compared to the total amount owed.

Their annual income ($18,246) does not cover their outstanding debt (

In [None]:
# Display summary of all predictions.
# Reads `results` (a list of dicts with 'sample', 'expected', 'input' and
# 'prediction' keys), extracts the text between <answer> and </answer> from each
# prediction, compares it with the expected label and prints the accuracy.
# An empty `results` list is treated like a missing one, which avoids dividing
# by zero when computing the accuracy.
if llm is not None and 'results' in locals() and results:
    print("\nüìä PREDICTION SUMMARY")
    print("=" * 100)

    correct_predictions = 0
    total_predictions = len(results)

    for result in results:
        # Extract the answer from prediction (look for <answer> tags).
        # partition() looks for the closing tag only AFTER the opening tag, so
        # a stray </answer> earlier in the text cannot produce an empty answer.
        prediction_text = result['prediction']
        _, open_tag, after_open = prediction_text.partition('<answer>')
        answer_body, close_tag, _ = after_open.partition('</answer>')
        if open_tag and close_tag:
            predicted_answer = answer_body.strip()
        else:
            predicted_answer = "Could not parse"

        # Exact, case-sensitive match against the expected label.
        is_correct = predicted_answer == result['expected']
        if is_correct:
            correct_predictions += 1

        status = "‚úÖ CORRECT" if is_correct else "‚ùå INCORRECT"

        print(f"\nSample {result['sample']}: {status}")
        print(f"Expected: {result['expected']}")
        print(f"Predicted: {predicted_answer}")
        print(f"Input: {result['input'][:60]}...")

    accuracy = (correct_predictions / total_predictions) * 100
    print(f"\nüéØ OVERALL ACCURACY: {correct_predictions}/{total_predictions} = {accuracy:.1f}%")

else:
    print("‚ùå No results to display")


In [8]:
# Test with LoRA model.
# Loads the LoRA model, runs it over every evaluation example, stores one dict
# per sample in `lora_results` and prints the accuracy.
from load_lora_model import load_lora_model, ask_financial_risk_lora

print("üîÑ Loading LoRA model...")
try:
    lora_llm = load_lora_model()
except Exception as e:
    # Only a failure of the load itself is reported as a load error.
    print(f"‚ùå Error loading LoRA model: {e}")
else:
    # The evaluation lives in the `else` clause so that an error raised while
    # evaluating is never mislabelled as "Error loading LoRA model".
    print("‚úÖ LoRA model loaded successfully!")

    total_samples = len(evaluation_data)
    print(f"\nüöÄ Running LoRA predictions on all {total_samples} samples...")
    print("=" * 100)

    lora_results = []
    correct_lora = 0

    for i, example in enumerate(evaluation_data, 1):
        print(f"\nüìù SAMPLE {i}: Expected = {example['answer']}")
        try:
            prediction = ask_financial_risk_lora(example['question'], lora_llm)

            # Extract answer from prediction. partition() looks for the
            # closing tag only after the opening tag.
            _, open_tag, after_open = prediction.partition('<answer>')
            answer_body, close_tag, _ = after_open.partition('</answer>')
            if open_tag and close_tag:
                predicted = answer_body.strip()
            else:
                predicted = "Could not parse"

            is_correct = predicted == example['answer']
            if is_correct:
                correct_lora += 1

            status = "‚úÖ" if is_correct else "‚ùå"
            print(f"{status} Predicted: {predicted}")
            print(f"Input: {example['question'][:60]}...")

        except Exception as e:
            # A failed sample counts as incorrect; keep evaluating the rest.
            print(f"‚ùå Error: {e}")
            prediction = f"ERROR: {e}"
            predicted = "Could not parse"

        # Record every sample (the list was previously created but never filled).
        lora_results.append({
            'sample': i,
            'expected': example['answer'],
            'input': example['question'],
            'prediction': prediction,
            'predicted': predicted
        })

    # Guard the division so an empty evaluation set reports 0/0 instead of raising.
    lora_accuracy = (correct_lora / total_samples) * 100 if total_samples else 0.0
    print(f"\nüéØ LoRA ACCURACY: {correct_lora}/{total_samples} = {lora_accuracy:.1f}%")


üîÑ Loading LoRA model...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


‚úÖ LoRA model loaded successfully!

üöÄ Running LoRA predictions on all 10 samples...

üìù SAMPLE 1: Expected = Good
<reasoning>
Based on the information provided, a 19-year-old musician with an annual income of $18,246.25 has a credit utilization ratio of 40.44%. This is generally considered high, which could be a red flag for potential lenders. The payment behavior described as "Low_spent_Small_value_payments" suggests that the individual might have a history of making smaller rather than larger payments, which could indicate an inconsistent payment history. However, the annual income of $18,246.25 is not sufficient to cover an outstanding debt of $845.61, which might not be a good sign. Nonetheless, the combination of high credit utilization and low income does not constitute a "Good" risk profile, so we will categorize this as a "Standard" risk.
</reasoning>
<answer>
Standard
</answer>‚ùå Predicted: Standard
Input: Age: 19.0, Occupation: Musician, Annual Income: 18246.25_, O...


In [9]:
# Test with Base Model (no fine-tuning).
# Loads the GGUF base model through llama_cpp, prompts it with every evaluation
# example using the <reasoning>/<answer> format below, and prints the accuracy.
from llama_cpp import Llama

# The model location can be overridden with the BASE_MODEL_PATH environment
# variable; the default is the original hard-coded path, so existing setups
# keep working unchanged.
base_model_path = os.environ.get(
    "BASE_MODEL_PATH",
    r"D:\Narwal\fine-tuning-lora-qlora\qwen2.5-3b-instruct-q8_0.gguf",
)

print("üîÑ Loading Base Model...")
try:
    base_llm = Llama(
        model_path=base_model_path,
        n_ctx=2048,
        n_threads=8,
        n_batch=512,
        verbose=False
    )
except Exception as e:
    # Only a failure of the load itself is reported as a load error.
    print(f"‚ùå Error loading Base model: {e}")
else:
    # The evaluation lives in the `else` clause so that an error raised while
    # evaluating is never mislabelled as "Error loading Base model".
    print("‚úÖ Base model loaded successfully!")

    total_samples = len(evaluation_data)
    print(f"\nüöÄ Running Base Model predictions on all {total_samples} samples...")
    print("=" * 100)

    correct_base = 0

    for i, example in enumerate(evaluation_data, 1):
        print(f"\nüìù SAMPLE {i}: Expected = {example['answer']}")

        prompt = f"""You are a financial risk analysis assistant.
Respond in the following format:
<reasoning>
(your reasoning here)
</reasoning>
<answer>
Choose exactly one of: "Good", "Bad", or "Standard"
</answer>

{example['question']}
"""

        try:
            # Collect the streamed chunks into one string.
            # NOTE(review): temperature > 0 with no seed set here, so repeated
            # runs may produce different predictions.
            prediction = "".join(
                chunk["choices"][0]["text"]
                for chunk in base_llm.create_completion(
                    prompt,
                    max_tokens=500,
                    stream=True,
                    temperature=0.3,
                    top_p=0.9,
                    repeat_penalty=1.2,
                    top_k=40
                )
            )

            # Extract answer from prediction. partition() looks for the
            # closing tag only after the opening tag.
            _, open_tag, after_open = prediction.partition('<answer>')
            answer_body, close_tag, _ = after_open.partition('</answer>')
            if open_tag and close_tag:
                predicted = answer_body.strip()
            else:
                predicted = "Could not parse"

            is_correct = predicted == example['answer']
            if is_correct:
                correct_base += 1

            status = "‚úÖ" if is_correct else "‚ùå"
            print(f"{status} Predicted: {predicted}")
            print(f"Input: {example['question'][:60]}...")

        except Exception as e:
            # A failed sample counts as incorrect; keep evaluating the rest.
            print(f"‚ùå Error: {e}")

    # Guard the division so an empty evaluation set reports 0/0 instead of raising.
    base_accuracy = (correct_base / total_samples) * 100 if total_samples else 0.0
    print(f"\nüéØ BASE MODEL ACCURACY: {correct_base}/{total_samples} = {base_accuracy:.1f}%")


üîÑ Loading Base Model...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


‚úÖ Base model loaded successfully!

üöÄ Running Base Model predictions on all 10 samples...

üìù SAMPLE 1: Expected = Good
‚ùå Predicted: Could not parse
Input: Age: 19.0, Occupation: Musician, Annual Income: 18246.25_, O...

üìù SAMPLE 2: Expected = Good
‚ùå Predicted: Could not parse
Input: Age: 20.0, Occupation: Doctor, Annual Income: 38197.25, Outs...

üìù SAMPLE 3: Expected = Good
‚ùå Predicted: Could not parse
Input: Age: 37.0, Occupation: Doctor, Annual Income: 14619.875_, Ou...

üìù SAMPLE 4: Expected = Good
‚úÖ Predicted: Good
Input: Age: 23.0, Occupation: Musician, Annual Income: 19028.33, Ou...

üìù SAMPLE 5: Expected = Standard
‚ùå Predicted: Bad
Input: Age: 17.0, Occupation: Manager, Annual Income: 44286.36, Out...

üìù SAMPLE 6: Expected = Standard
‚úÖ Predicted: Standard
Input: Age: 31.0, Occupation: Teacher, Annual Income: 7889.11, Outs...

üìù SAMPLE 7: Expected = Standard
‚ùå Predicted: Could not parse
Input: Age: 39.0, Occupation: Scientist, Annual Income: 3