# Few-Shot Property Scoring - Visual Analysis

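This notebook scores property condition with Vision-Language Models (VLMs) in a few-shot setting: expert-annotated example images are selected for each score and referenced in the prompt before the model scores each test image. In the current implementation the examples are included as text (file name and expert score), not as image attachments.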

We'll:
1. Select gold standard examples for each score (1-5)
2. Build few-shot prompts with examples
3. Run scoring on test images
4. Compare with zero-shot results (if available)


In [None]:
import os
import json
import re
import base64
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from IPython.display import display, Image, Markdown
from src.data_loader import DataLoader
from src.providers import get_provider
from src.config import Config

# Plot defaults applied to every figure created later in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# Read every annotation row through the project data loader
loader = DataLoader()
df = loader.load_annotations()

# Keep only rows that carry an expert score, stored as an integer
has_expert_score = df['expert_score'].notna()
scored_df = df[has_expert_score].copy()
scored_df['expert_score'] = scored_df['expert_score'].astype(int)

# Drop rows whose image file is not present on disk
image_on_disk = scored_df['image_path'].apply(os.path.exists)
scored_df = scored_df[image_on_disk]

score_distribution = scored_df['expert_score'].value_counts().sort_index()
print(f"📊 Total images with expert scores: {len(scored_df)}")
print(f"\n📊 Score distribution:")
print(score_distribution)


In [None]:
def select_gold_standard_examples(df_annotations, examples_per_score=1,
                                  scores=(1, 2, 3, 4, 5), random_state=42):
    """Pick random example images for each score to serve as gold standards.

    Args:
        df_annotations: DataFrame with 'expert_score', 'image_path' and
            'file_name' columns.
        examples_per_score: Maximum number of examples to draw per score. A
            score with fewer rows contributes all of them.
        scores: Score values to draw examples for, in output order.
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible.

    Returns:
        DataFrame with columns 'score', 'image_path', 'file_name', one row per
        selected example. The columns are present even when nothing was
        selected, so callers can index or group by them unconditionally.
    """
    columns = ['score', 'image_path', 'file_name']
    records = []

    for score in scores:
        candidates = df_annotations[df_annotations['expert_score'] == score]
        if candidates.empty:
            continue

        n_pick = min(examples_per_score, len(candidates))
        picked = candidates.sample(n=n_pick, random_state=random_state)

        for _, row in picked.iterrows():
            records.append({
                'score': score,
                'image_path': row['image_path'],
                'file_name': row['file_name']
            })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)

# Draw the gold standard examples that the few-shot prompt will reference
EXAMPLES_PER_SCORE = 2  # how many examples to draw for each score
gold_standards = select_gold_standard_examples(
    scored_df,
    examples_per_score=EXAMPLES_PER_SCORE,
)

examples_by_score = gold_standards.groupby('score').size()
print(f"✅ Selected {len(gold_standards)} gold standard examples:")
print(examples_by_score)
print("\nGold standard examples:")
display(gold_standards)


In [None]:
# Load the zero-shot scoring prompt; the few-shot prompt is built on top of it
prompt_path = os.path.join(Config.PROMPTS_DIR, "prompt_zero_shot.txt")
# Explicit encoding: without it open() falls back to the platform locale,
# which can mis-decode or reject non-ASCII characters in the prompt text.
# NOTE(review): assumes the prompt file is stored as UTF-8 — confirm.
with open(prompt_path, "r", encoding="utf-8") as f:
    base_prompt = f.read()

# Preview only the start of the prompt to keep the cell output short
print("Base Prompt (first 300 chars):")
print("=" * 80)
print(base_prompt[:300] + "...")
print("=" * 80)


In [None]:
def build_fewshot_prompt(base_prompt, gold_standards_df, provider_name="openai"):
    """Append a text listing of the gold standard examples to the base prompt.

    Each row of ``gold_standards_df`` contributes a short text entry built
    from its 'score' and 'file_name' values; no image data is embedded.
    ``provider_name`` is accepted but not used by this text-only version.

    Returns the base prompt, followed by the examples section, followed by
    the closing instruction to score the provided image.
    """
    example_entries = [
        f"Example - Score {example['score']}:\n"
        f"Image: {example['file_name']}\n"
        f"This property received a score of {example['score']}.\n\n"
        for _, example in gold_standards_df.iterrows()
    ]

    examples_section = "\n\n## Examples:\n\n" + "".join(example_entries)
    closing_instruction = (
        "\n\nNow analyze the provided image and assign a score following the same criteria."
    )

    return base_prompt + examples_section + closing_instruction

# Assemble the prompt once; the same text is sent with every test image
fewshot_prompt = build_fewshot_prompt(base_prompt, gold_standards)
prompt_length = len(fewshot_prompt)
print("Few-shot prompt created (length:", prompt_length, "chars)")


In [None]:
def _coerce_score(value):
    """Return ``value`` as an int in 1-5, or None if it is not a whole number in that range."""
    if isinstance(value, bool):
        # bool is an int subclass; True must not be read as a score of 1.
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if number.is_integer() and 1 <= number <= 5:
        return int(number)
    return None


def _score_from_json(json_text):
    """Parse ``json_text`` and return its 'score' / 'overall_score' as a valid score, or None."""
    try:
        parsed = json.loads(json_text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(parsed, dict):
        return None
    for key in ("score", "overall_score"):
        score = _coerce_score(parsed.get(key))
        if score is not None:
            return score
    return None


def extract_score_from_response(response):
    """Extract a 1-5 score from a VLM text response.

    Strategies are tried from most to least structured, and a failure in one
    (for example malformed JSON) falls through to the next instead of
    aborting the whole extraction:

    1. a fenced ```json block containing "score" or "overall_score";
    2. a flat ``{...}`` object containing a "score" key;
    3. a ``"score": <digit>`` fragment;
    4. a ``score: <digit>`` / ``score <digit>`` phrase;
    5. the first standalone digit between 1 and 5 (last resort).

    Args:
        response: Raw response text from the model; may be None or empty.

    Returns:
        The score as an int between 1 and 5, or None when the response is
        empty, is not a string, or contains no usable score.
    """
    if not response or not isinstance(response, str):
        return None

    # 1. Fenced JSON block
    if "```json" in response:
        fenced = response.split("```json")[1].split("```")[0].strip()
        score = _score_from_json(fenced)
        if score is not None:
            return score

    # 2. Flat JSON object that mentions a "score" key
    flat_object = re.search(r'\{[^{}]*"score"[^{}]*\}', response)
    if flat_object:
        score = _score_from_json(flat_object.group())
        if score is not None:
            return score

    # 3-5. Regex fallbacks. The trailing \b keeps "10" from being read as 1.
    fallback_patterns = (
        (r'"score"\s*:\s*(\d)\b', re.IGNORECASE),
        (r'\bscore[:\s]+(\d)\b', re.IGNORECASE),
        (r'\b([1-5])\b', 0),
    )
    for pattern, flags in fallback_patterns:
        match = re.search(pattern, response, flags)
        if match:
            score = _coerce_score(match.group(1))
            if score is not None:
                return score

    return None


In [None]:
# Run few-shot scoring on test images
provider_name = "openai"  # Change as needed
provider = get_provider(provider_name)

MAX_TEST_IMAGES = 30  # upper bound on the number of images sent to the provider

# Fixed result schema: every record carries the same keys, and the DataFrame
# keeps these columns even when no image was processed, so later cells can
# index 'predicted_score' unconditionally.
RESULT_COLUMNS = [
    "image_path", "file_name", "expert_score", "predicted_score",
    "provider", "method", "raw_response", "error",
]

# Use images not in gold standards for testing
is_gold_standard = scored_df['file_name'].isin(gold_standards['file_name'])
test_images = scored_df[~is_gold_standard].copy()
test_images = test_images.sample(n=min(MAX_TEST_IMAGES, len(test_images)), random_state=42)

print(f"🧪 Testing on {len(test_images)} images (excluding gold standards)")
print(f"✅ Using provider: {provider_name} ({provider.model_name})")

results = []
for idx, (_, row) in enumerate(test_images.iterrows(), 1):
    img_path = row['image_path']
    expert_score = row['expert_score']

    if idx % 5 == 0:
        print(f"  Progress: {idx}/{len(test_images)}")

    record = {
        "image_path": img_path,
        "file_name": row['file_name'],
        "expert_score": expert_score,
        "predicted_score": None,
        "provider": provider_name,
        "method": "few-shot",
        "raw_response": None,
        "error": None,
    }
    try:
        response = provider.analyze(img_path, fewshot_prompt)
        record["raw_response"] = response if response else None
        record["predicted_score"] = extract_score_from_response(response)
    except Exception as e:
        # Best-effort: one failed image must not stop the run. The message is
        # kept in the 'error' column so failures stay visible in the output.
        record["error"] = str(e)
    results.append(record)

fewshot_results = pd.DataFrame(results, columns=RESULT_COLUMNS)
print(f"\n✅ Completed few-shot scoring: {len(fewshot_results)} images")


In [None]:
# Preview the raw results, then summarise the rows that produced a score
display(fewshot_results.head(10))

has_prediction = fewshot_results['predicted_score'].notna()
valid_predictions = fewshot_results[has_prediction]
if len(valid_predictions) > 0:
    # Exact-match accuracy over rows where a score could be extracted
    exact_match = valid_predictions['expert_score'] == valid_predictions['predicted_score']
    predicted_distribution = valid_predictions['predicted_score'].value_counts().sort_index()

    print(f"\n✅ Valid predictions: {len(valid_predictions)}/{len(fewshot_results)}")
    print(f"📊 Accuracy: {np.mean(exact_match):.2%}")
    print(f"\n📊 Predicted score distribution:")
    print(predicted_distribution)


In [None]:
# Visualize few-shot results
SCORE_LABELS = [1, 2, 3, 4, 5]

valid_df = fewshot_results[fewshot_results['predicted_score'].notna()].copy()
# Missing predictions turn the column into floats; cast back to int so the
# axis labels read 1..5 rather than 1.0..5.0.
valid_df['predicted_score'] = valid_df['predicted_score'].astype(int)

if len(valid_df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Score distribution comparison. Reindexing to all five scores shows a
    # zero-height bar for scores that never occur, matching the 5x5
    # confusion matrix next to it.
    expert_counts = valid_df['expert_score'].value_counts().reindex(SCORE_LABELS, fill_value=0)
    pred_counts = valid_df['predicted_score'].value_counts().reindex(SCORE_LABELS, fill_value=0)

    comparison = pd.DataFrame({
        'Expert Score': expert_counts,
        'Predicted Score': pred_counts
    })

    comparison.plot(kind='bar', ax=axes[0], color=['skyblue', 'coral'], rot=0)
    axes[0].set_title('Few-Shot: Score Distribution')
    axes[0].set_xlabel('Score')
    axes[0].set_ylabel('Count')
    axes[0].legend()

    # Confusion matrix (rows: expert score, columns: predicted score)
    cm = confusion_matrix(valid_df['expert_score'], valid_df['predicted_score'], labels=SCORE_LABELS)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1],
                xticklabels=SCORE_LABELS, yticklabels=SCORE_LABELS)
    axes[1].set_title('Few-Shot: Confusion Matrix')
    axes[1].set_xlabel('Predicted Score')
    axes[1].set_ylabel('Expert Score')

    plt.tight_layout()
    plt.show()

    # zero_division=0 reports 0 for scores absent from the sample instead of
    # emitting an undefined-metric warning for each of them.
    report = classification_report(
        valid_df['expert_score'],
        valid_df['predicted_score'],
        labels=SCORE_LABELS,
        zero_division=0,
    )
    print(f"\n{report}")


In [None]:
# Compare few-shot against previously saved zero-shot results, when present
zeroshot_path = os.path.join(Config.OUTPUTS_DIR, f"zeroshot_scores_{provider_name}.csv")

if not os.path.exists(zeroshot_path):
    print("⚠️  Zero-shot results not found. Run notebook 02 first to compare.")
else:
    zeroshot_results = pd.read_csv(zeroshot_path)
    zeroshot_valid = zeroshot_results[zeroshot_results['predicted_score'].notna()]

    if len(zeroshot_valid) > 0 and len(valid_df) > 0:
        # Exact-match accuracy for each method.
        # NOTE(review): each accuracy is computed over whatever rows its own
        # frame holds; confirm both runs scored the same images before
        # reading much into the difference.
        zeroshot_acc = np.mean(zeroshot_valid['expert_score'] == zeroshot_valid['predicted_score'])
        fewshot_acc = np.mean(valid_df['expert_score'] == valid_df['predicted_score'])

        print(f"📊 Comparison:")
        print(f"  Zero-shot accuracy: {zeroshot_acc:.2%}")
        print(f"  Few-shot accuracy: {fewshot_acc:.2%}")
        print(f"  Improvement: {fewshot_acc - zeroshot_acc:+.2%}")

        # One scatter panel per method: expert vs. predicted score, with the
        # diagonal marking perfect agreement
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        panels = [
            ("Zero-Shot", zeroshot_valid, zeroshot_acc),
            ("Few-Shot", valid_df, fewshot_acc),
        ]

        for ax, (method, df_method, acc) in zip(axes, panels):
            ax.scatter(df_method['expert_score'], df_method['predicted_score'], alpha=0.5)
            ax.plot([1, 5], [1, 5], 'r--', label='Perfect')
            ax.set(
                xlabel='Expert Score',
                ylabel='Predicted Score',
                title=f'{method}\nAccuracy: {acc:.2%}',
                xlim=(0.5, 5.5),
                ylim=(0.5, 5.5),
            )
            ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()


In [None]:
# Persist the few-shot results as a CSV named after the provider
output_dir = Config.OUTPUTS_DIR
output_filename = f"fewshot_scores_{provider_name}.csv"
output_path = os.path.join(output_dir, output_filename)

# Create the output directory on first use; a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)
fewshot_results.to_csv(output_path, index=False)
print(f"✅ Results saved to: {output_path}")
