# Zero-Shot Property Scoring - Visual Analysis

This notebook scores property condition with Vision Language Models (VLMs) in a zero-shot setting: each model receives only the scoring prompt and the image, with no labelled examples.

We'll:
1. Load annotations and scoring prompt
2. Run scoring with different VLM providers
3. Compare predictions vs ground truth
4. Visualize results with confusion matrices and distributions


In [2]:
import os
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from IPython.display import display, Markdown
from src.data_loader import DataLoader
from src.providers import get_provider
from src.config import Config

# Notebook-wide plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Read the annotation table through the project data loader
loader = DataLoader()
df = loader.load_annotations()

# Keep only rows that carry an expert score, and store that score as an integer
scored_df = df.dropna(subset=['expert_score']).copy()
scored_df['expert_score'] = scored_df['expert_score'].astype(int)

# Discard rows whose image file is not present on disk
image_on_disk = scored_df['image_path'].apply(os.path.exists)
scored_df = scored_df[image_on_disk]

print(f"📊 Total images with expert scores: {len(scored_df)}")
print("\n📊 Score distribution:")
score_counts = scored_df['expert_score'].value_counts().sort_index()
print(score_counts)

# Optional down-sampling for quick test runs (seeded, so the same rows are drawn each time)
SAMPLE_SIZE = 4  # Change to None to use all images
if SAMPLE_SIZE:
    n_sampled = min(SAMPLE_SIZE, len(scored_df))
    scored_df = scored_df.sample(n=n_sampled, random_state=42)
    print(f"\n🎲 Using sample of {len(scored_df)} images for testing")


In [None]:
# Load scoring prompt
prompt_path = os.path.join(Config.PROMPTS_DIR, "prompt_zero_shot.txt")
# Decode with an explicit encoding so the prompt reads the same on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the prompt file is saved as UTF-8 — confirm.
with open(prompt_path, "r", encoding="utf-8") as f:
    scoring_prompt = f.read()

# Show a short preview; only add the ellipsis when the prompt was actually cut off
prompt_preview = scoring_prompt[:500]
if len(scoring_prompt) > 500:
    prompt_preview += "..."

print("Scoring Prompt (first 500 chars):")
print("=" * 80)
print(prompt_preview)
print("=" * 80)


In [None]:
def _coerce_score(value):
    """Return ``value`` as an int when it denotes a whole number, otherwise None."""
    # bool is a subclass of int; a JSON true/false is not a score
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        return int(value) if value.is_integer() else None
    if isinstance(value, str):
        try:
            return int(value.strip())
        except ValueError:
            return None
    return None


def _score_from_json(json_text):
    """Parse ``json_text`` and return its "score"/"overall_score" as an int, or None."""
    try:
        parsed = json.loads(json_text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(parsed, dict):
        return None
    for key in ("score", "overall_score"):
        score = _coerce_score(parsed.get(key))
        if score is not None:
            return score
    return None


def extract_score_from_response(response):
    """Extract an integer score from a VLM response.

    Strategies are tried in order and the first hit wins:

    1. the first ```json fenced block, parsed as JSON;
    2. the first flat ``{...}`` object containing a "score" or
       "overall_score" key, parsed as JSON;
    3. a ``"score": <digit>`` fragment (covers malformed JSON);
    4. a ``score: <digit>`` / ``score <digit>`` phrase in free text;
    5. the first standalone digit between 1 and 5.

    A strategy that fails (including a JSON parse error) falls through to
    the next one instead of ending the search. JSON values are coerced to
    int, so ``"3"`` and ``4.0`` come back as ``3`` and ``4``; values that
    are not whole numbers are left to the text heuristics.

    Args:
        response: Raw text returned by the model. Falsy or non-string
            values yield None.

    Returns:
        The score as an int, or None when no score could be found.
    """
    if not response or not isinstance(response, str):
        return None

    # Collect JSON candidates: fenced block first, then a flat object with a score key
    json_candidates = []
    if "```json" in response:
        fenced = response.split("```json", 1)[1].split("```", 1)[0].strip()
        json_candidates.append(fenced)
    bare_object = re.search(r'\{[^{}]*"(?:overall_)?score"[^{}]*\}', response)
    if bare_object:
        json_candidates.append(bare_object.group())

    for candidate in json_candidates:
        score = _score_from_json(candidate)
        if score is not None:
            return score

    # Text heuristics, most specific first. The trailing \b keeps a
    # multi-digit number such as 10 from being read as its first digit.
    text_patterns = (
        (r'"score"\s*:\s*"?(\d)\b', re.IGNORECASE),
        (r'\bscore[:\s]+(\d)\b', re.IGNORECASE),
        (r'\b([1-5])\b', 0),
    )
    for pattern, flags in text_patterns:
        score_match = re.search(pattern, response, flags)
        if score_match:
            return int(score_match.group(1))

    return None


In [None]:
# Test with different providers
providers_to_test = ["local", "openai", "google", "together"]  # Add/remove as needed
results_by_provider = {}

# Every result row uses the same columns, whether scoring succeeded or failed.
# Passing them to the DataFrame constructor also guarantees the columns exist
# when no images were processed, so later cells can always read 'predicted_score'.
RESULT_COLUMNS = [
    "image_path",
    "file_name",
    "expert_score",
    "predicted_score",
    "provider",
    "model",
    "raw_response",
    "error",
]

for provider_name in providers_to_test:
    print(f"\n{'='*60}")
    print(f"Testing {provider_name.upper()} provider")
    print(f"{'='*60}")

    # Best effort: a provider that cannot be created or used is reported and skipped
    try:
        provider = get_provider(provider_name)
        model_name = provider.model_name
        print(f"✅ Model: {model_name}")

        results = []
        for idx, (_, row) in enumerate(scored_df.iterrows(), 1):
            img_path = row['image_path']

            if idx % 10 == 0:
                print(f"  Progress: {idx}/{len(scored_df)}")

            record = {
                "image_path": img_path,
                "file_name": os.path.basename(img_path),
                "expert_score": row['expert_score'],
                "predicted_score": None,
                "provider": provider_name,
                "model": model_name,
                "raw_response": None,
                "error": None,
            }
            # A failure on one image is stored in its row; the run continues
            try:
                response = provider.analyze(img_path, scoring_prompt)
                predicted_score = extract_score_from_response(response)
                # Only the first 200 characters of the response are kept
                response_snippet = response[:200] if response else None
                record["predicted_score"] = predicted_score
                record["raw_response"] = response_snippet
            except Exception as e:
                record["error"] = str(e)
            results.append(record)

        results_by_provider[provider_name] = pd.DataFrame(results, columns=RESULT_COLUMNS)
        print(f"✅ Completed {provider_name}: {len(results)} images processed")

    except Exception as e:
        print(f"❌ Error with {provider_name}: {e}")


In [None]:
# Per provider: preview the first rows, then summarise the usable predictions
for provider_name in results_by_provider:
    results_df = results_by_provider[provider_name]
    banner = '=' * 60

    print(f"\n{banner}")
    print(f"{provider_name.upper()} Results")
    print(banner)
    display(results_df.head(10))

    # Rows without a predicted score are excluded from the statistics
    valid_predictions = results_df.dropna(subset=['predicted_score'])
    if not valid_predictions.empty:
        print(f"\n✅ Valid predictions: {len(valid_predictions)}/{len(results_df)}")
        print("📊 Predicted score distribution:")
        predicted_counts = valid_predictions['predicted_score'].value_counts().sort_index()
        print(predicted_counts)


In [None]:
# Visualize predictions vs ground truth
SCORE_LABELS = [1, 2, 3, 4, 5]  # score classes shown in the confusion matrix and report

for provider_name, results_df in results_by_provider.items():
    # predicted_score contains None for images that failed or could not be parsed,
    # so the column is float or object dtype. Convert it to numbers, keep only
    # whole-number predictions, and store them as int so they align with the
    # integer expert scores in the counts and in the sklearn metrics.
    numeric_pred = pd.to_numeric(results_df['predicted_score'], errors='coerce')
    is_whole = numeric_pred.notna() & (numeric_pred % 1 == 0)
    valid_df = results_df[is_whole].copy()
    valid_df['predicted_score'] = numeric_pred[is_whole].astype(int)

    if len(valid_df) == 0:
        continue

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Score distribution comparison
    expert_counts = valid_df['expert_score'].value_counts().sort_index()
    pred_counts = valid_df['predicted_score'].value_counts().sort_index()

    # Scores present on only one side get a count of 0 on the other
    comparison = pd.DataFrame({
        'Expert Score': expert_counts,
        'Predicted Score': pred_counts
    }).fillna(0)

    comparison.plot(kind='bar', ax=axes[0], color=['skyblue', 'coral'])
    axes[0].set_title(f'{provider_name.upper()}: Score Distribution')
    axes[0].set_xlabel('Score')
    axes[0].set_ylabel('Count')
    axes[0].legend()
    axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)

    # Confusion matrix (rows: expert score, columns: predicted score)
    cm = confusion_matrix(valid_df['expert_score'], valid_df['predicted_score'], labels=SCORE_LABELS)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1],
                xticklabels=SCORE_LABELS, yticklabels=SCORE_LABELS)
    axes[1].set_title(f'{provider_name.upper()}: Confusion Matrix')
    axes[1].set_xlabel('Predicted Score')
    axes[1].set_ylabel('Expert Score')

    plt.tight_layout()
    plt.show()

    # Classification report; zero_division=0 reports 0 for scores that have no
    # samples instead of emitting an undefined-metric warning for each of them
    report = classification_report(
        valid_df['expert_score'],
        valid_df['predicted_score'],
        labels=SCORE_LABELS,
        zero_division=0,
    )
    print(f"\n{report}")


In [None]:
# Compare all providers side by side
# Runs for a single provider too: this is the only cell that reports accuracy.
if results_by_provider:
    n_providers = len(results_by_provider)
    # squeeze=False always returns a 2-D array of axes, so one provider needs no special case
    fig, axes = plt.subplots(1, n_providers, figsize=(5 * n_providers, 5), squeeze=False)

    for ax, (provider_name, results_df) in zip(axes[0], results_by_provider.items()):
        valid_df = results_df[results_df['predicted_score'].notna()]

        ax.set_xlabel('Expert Score')
        ax.set_ylabel('Predicted Score')
        ax.set_xlim(0.5, 5.5)
        ax.set_ylim(0.5, 5.5)
        ax.grid(True, alpha=0.3)

        if len(valid_df) == 0:
            # Label the empty panel so it is clear which provider produced nothing
            ax.set_title(f'{provider_name.upper()}\nNo valid predictions')
            continue

        # Accuracy = share of valid predictions that exactly match the expert score
        accuracy = np.mean(valid_df["expert_score"] == valid_df["predicted_score"])

        # Scatter plot: predicted vs expert
        ax.scatter(valid_df['expert_score'], valid_df['predicted_score'], alpha=0.5)
        ax.plot([1, 5], [1, 5], 'r--', label='Perfect prediction')
        ax.set_title(f'{provider_name.upper()}\nAccuracy: {accuracy:.2%}')
        ax.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Write one CSV per provider into the configured outputs directory
output_dir = Config.OUTPUTS_DIR
os.makedirs(output_dir, exist_ok=True)

for provider_name in results_by_provider:
    results_df = results_by_provider[provider_name]
    csv_name = "zeroshot_scores_{}.csv".format(provider_name)
    output_path = os.path.join(output_dir, csv_name)
    # index=False: the DataFrame index is not written to the file
    results_df.to_csv(output_path, index=False)
    print(f"✅ {provider_name} results saved to: {output_path}")
