In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from matplotlib.colors import Normalize
import matplotlib.cm as cm

# Project root: taken from the RMOOD_HOME environment variable when it is set,
# otherwise the default checkout location is used.
RMOOD_HOME = os.environ.get("RMOOD_HOME", "/root/reward-modeling-develop")

In [None]:
def visualize_pca_with_rewards(reward_model_name, prompt_idx, show_weight=True,
                               expected_num_samples=512):
    """
    Reduce the sampled representations of one prompt to 2D with PCA and plot
    them colored by reward, optionally overlaying the score-layer weight
    direction as an arrow.

    Args:
        reward_model_name (str): Reward model name (e.g., "Hahmdong/RMOOD-qwen3-4b-alpacafarm-rm")
        prompt_idx (int): Prompt index (e.g., 0, 1, 2, ...)
        show_weight (bool): If True, show the score layer weight vector
        expected_num_samples (int | None): Number of samples the representation
            file must contain (512 by default). Pass None to skip this check.

    Returns:
        tuple: (representations_2d, rewards, pca, weight_2d) where weight_2d is
        None when the weight was not requested or its file does not exist.

    Raises:
        FileNotFoundError: If the representation or reward file is missing.
        ValueError: If the number of representations and rewards differ, or
            the sample count does not equal expected_num_samples.
    """
    # Clean model name for file path
    reward_model_name_clean = reward_model_name.replace("/", "_")
    
    # Construct file paths
    representation_path = f"{RMOOD_HOME}/datasets/alpacafarm/distribution/{reward_model_name_clean}/representation_{prompt_idx}.npy"
    reward_path = f"{RMOOD_HOME}/datasets/alpacafarm/distribution/{reward_model_name_clean}/reward_{prompt_idx}.json"
    weight_path = f"{RMOOD_HOME}/datasets/alpacafarm/distribution/{reward_model_name_clean}/weight.npy"
    
    # Check if files exist
    if not os.path.exists(representation_path):
        raise FileNotFoundError(f"Representation file not found: {representation_path}")
    if not os.path.exists(reward_path):
        raise FileNotFoundError(f"Reward file not found: {reward_path}")
    
    # Load representation data
    print(f"Loading representations from: {representation_path}")
    representations = np.load(representation_path)
    print(f"Representation shape: {representations.shape}")
    
    # Load reward data
    print(f"Loading rewards from: {reward_path}")
    with open(reward_path, "r") as f:
        rewards_data = json.load(f)
    rewards = np.array(rewards_data[0])  # Extract the list from the outer array
    print(f"Reward shape: {rewards.shape}")
    
    # Load weight if available and requested
    weight = None
    if show_weight and os.path.exists(weight_path):
        print(f"Loading weight from: {weight_path}")
        weight = np.load(weight_path)
        print(f"Weight shape: {weight.shape}")
        # Drop singleton axes so the weight becomes a flat vector
        weight = weight.squeeze()
    else:
        if show_weight:
            print(f"Warning: Weight file not found at {weight_path}")
    
    # Validate data with real exceptions: assert statements are stripped
    # when Python runs with -O, which would silently disable these checks.
    if len(representations) != len(rewards):
        raise ValueError(
            f"Mismatch: {len(representations)} representations vs {len(rewards)} rewards"
        )
    if expected_num_samples is not None and len(representations) != expected_num_samples:
        raise ValueError(
            f"Expected {expected_num_samples} samples, got {len(representations)}"
        )
    
    # Apply PCA to reduce to 2D
    print("Applying PCA...")
    pca = PCA(n_components=2)
    representations_2d = pca.fit_transform(representations)
    
    # Project the weight vector onto the two principal axes.
    # The weight is a direction, not a data point, so it must NOT be centered:
    # pca.transform() would subtract the data mean first and the resulting
    # arrow would mostly point away from the mean instead of along the weight.
    weight_2d = None
    if weight is not None:
        weight_2d = pca.components_ @ weight
        print(f"Weight in 2D PCA space: {weight_2d}")
    
    # Print explained variance
    explained_variance = pca.explained_variance_ratio_
    print(f"Explained variance ratio: PC1={explained_variance[0]:.4f}, PC2={explained_variance[1]:.4f}")
    print(f"Total explained variance: {sum(explained_variance):.4f}")
    
    # Create visualization
    plt.figure(figsize=(12, 10))
    
    # Scatter plot for representations; the color scale is autoscaled to the
    # min/max of the rewards by matplotlib.
    scatter = plt.scatter(
        representations_2d[:, 0], 
        representations_2d[:, 1],
        c=rewards,
        cmap='viridis',
        alpha=0.6,
        s=50,
        edgecolors='black',
        linewidth=0.5,
        label='Representations'
    )
    
    # Plot weight vector as arrow from center
    if weight_2d is not None:
        # Compute the center of mass of representations for better visualization
        center = representations_2d.mean(axis=0)
        
        # Scale weight vector for better visualization
        # Scale it to be similar magnitude as the spread of points
        data_scale = np.std(representations_2d, axis=0).mean()
        weight_norm = np.linalg.norm(weight_2d)
        if weight_norm > 0:
            scaled_weight = weight_2d * (data_scale * 2.0) / weight_norm
        else:
            scaled_weight = weight_2d
        
        # Draw arrow from center
        plt.arrow(center[0], center[1], 
                 scaled_weight[0], scaled_weight[1],
                 head_width=data_scale*0.15, 
                 head_length=data_scale*0.2,
                 fc='red', ec='darkred', 
                 linewidth=3, 
                 alpha=0.8,
                 length_includes_head=True,
                 label='Score Weight Direction',
                 zorder=5)
        
        # Mark the center point
        plt.scatter(center[0], center[1], 
                   c='red', s=100, marker='x', 
                   linewidths=3, zorder=6,
                   label='Center of Representations')
    
    # Add colorbar
    cbar = plt.colorbar(scatter)
    cbar.set_label('Reward', rotation=270, labelpad=20, fontsize=12)
    
    # Labels and title ('\n' is a real newline so the title spans several lines)
    plt.xlabel(f'PC1 ({explained_variance[0]:.2%} variance)', fontsize=12)
    plt.ylabel(f'PC2 ({explained_variance[1]:.2%} variance)', fontsize=12)
    title = f'PCA Visualization of Responses (Prompt {prompt_idx})\nReward Model: {reward_model_name}'
    if weight_2d is not None:
        title += '\n(Red arrow shows score layer weight direction)'
    plt.title(title, fontsize=14, pad=20)
    
    # Add grid
    plt.grid(True, alpha=0.3, linestyle='--')
    
    # Add legend
    if weight_2d is not None:
        plt.legend(loc='upper right', fontsize=10)
    
    # Add statistics text, one statistic per line
    stats_text = f'Total samples: {len(rewards)}\n'
    stats_text += f'Reward range: [{rewards.min():.3f}, {rewards.max():.3f}]\n'
    stats_text += f'Mean reward: {rewards.mean():.3f} ± {rewards.std():.3f}'
    if weight is not None:
        stats_text += f'\nWeight norm: {np.linalg.norm(weight):.3f}'
    plt.text(0.02, 0.98, stats_text, 
             transform=plt.gca().transAxes,
             fontsize=10,
             verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    
    plt.tight_layout()
    
    # Save figure
    output_dir = f"{RMOOD_HOME}/rmood/distribution/visualization/outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/pca_prompt_{prompt_idx}_{reward_model_name_clean}.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"\nFigure saved to: {output_path}")
    
    plt.show()
    
    return representations_2d, rewards, pca, weight_2d

## Example Usage

아래 셀을 실행하여 특정 prompt에 대한 PCA 시각화를 생성할 수 있습니다.
빨간 화살표는 score layer의 weight 방향을 나타냅니다.

In [None]:
# Example: run the PCA visualization for prompt 0 of the reward model.
# Both names below are module-level so that later cells can reuse them.
reward_model_name = "Hahmdong/RMOOD-qwen3-4b-alpacafarm-rm"
prompt_idx = 0

representations_2d, rewards, pca, weight_2d = visualize_pca_with_rewards(
    reward_model_name=reward_model_name,
    prompt_idx=prompt_idx,
    show_weight=True,
)

## 다른 Prompt 인덱스 시각화

다른 prompt를 시각화하려면 아래와 같이 `prompt_idx`를 변경하세요.

In [None]:
# Visualize different prompt indices, skipping any prompt whose data files are missing
for idx in [1, 2, 3]:  # Add more indices as needed
    # Banner for this prompt; "\n" is a real newline so a blank line separates
    # it from the previous prompt's output.
    print(f"\n{'='*60}")
    print(f"Processing Prompt {idx}")
    print(f"{'='*60}")
    # Keep the try body limited to the call that can raise FileNotFoundError
    try:
        representations_2d, rewards, pca, weight_2d = visualize_pca_with_rewards(
            reward_model_name, idx, show_weight=True
        )
    except FileNotFoundError as e:
        print(f"Skipping prompt {idx}: {e}")

## 추가 분석: Reward 분포 히스토그램

In [None]:
def plot_reward_distribution(reward_model_name, prompt_idx):
    """
    Plot a histogram of the rewards stored for one prompt and save it as a PNG.

    Args:
        reward_model_name (str): Reward model name; "/" is replaced with "_"
            to build the data and output file names.
        prompt_idx (int): Prompt index whose reward file is read.

    Raises:
        FileNotFoundError: If the reward file does not exist (raised by open()).
    """
    reward_model_name_clean = reward_model_name.replace("/", "_")
    reward_path = f"{RMOOD_HOME}/datasets/alpacafarm/distribution/{reward_model_name_clean}/reward_{prompt_idx}.json"
    
    with open(reward_path, "r") as f:
        rewards_data = json.load(f)
    # The rewards are the first element of the outer JSON array
    rewards = np.array(rewards_data[0])
    
    plt.figure(figsize=(10, 6))
    plt.hist(rewards, bins=50, alpha=0.7, color='blue', edgecolor='black')
    plt.xlabel('Reward', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    # '\n' is a real newline so the model name goes on its own title line
    plt.title(f'Reward Distribution (Prompt {prompt_idx})\nReward Model: {reward_model_name}', 
              fontsize=14, pad=20)
    plt.grid(True, alpha=0.3, linestyle='--')
    
    # Add statistics, one per line
    stats_text = f'Mean: {rewards.mean():.3f}\n'
    stats_text += f'Std: {rewards.std():.3f}\n'
    stats_text += f'Min: {rewards.min():.3f}\n'
    stats_text += f'Max: {rewards.max():.3f}'
    plt.text(0.98, 0.98, stats_text, 
             transform=plt.gca().transAxes,
             fontsize=10,
             verticalalignment='top',
             horizontalalignment='right',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    
    plt.tight_layout()
    
    # Save figure
    output_dir = f"{RMOOD_HOME}/rmood/distribution/visualization/outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/reward_dist_prompt_{prompt_idx}_{reward_model_name_clean}.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"Figure saved to: {output_path}")
    
    plt.show()

# Example: histogram of rewards for the model and prompt selected earlier
plot_reward_distribution(
    reward_model_name=reward_model_name,
    prompt_idx=prompt_idx,
)

## Weight 방향과 Reward의 관계 분석

Score layer의 weight 방향과 실제 reward 값의 상관관계를 분석합니다.

In [None]:
def analyze_weight_correlation(reward_model_name, prompt_idx):
    """
    Compare the inner product of each representation with the score-layer
    weight against the stored reward for one prompt.

    Draws a projection-vs-reward scatter plot and a residual plot side by
    side, saves the figure as a PNG and prints summary statistics.

    Args:
        reward_model_name (str): Reward model name; "/" is replaced with "_"
            to build the data and output file names.
        prompt_idx (int): Prompt index whose data files are read.

    Returns:
        tuple: (projections, correlation), i.e. the per-sample inner products
        and their Pearson correlation coefficient with the rewards.
    """
    model_tag = reward_model_name.replace("/", "_")
    data_dir = f"{RMOOD_HOME}/datasets/alpacafarm/distribution/{model_tag}"

    # Load the three inputs: representations, rewards, then the weight
    representations = np.load(f"{data_dir}/representation_{prompt_idx}.npy")
    with open(f"{data_dir}/reward_{prompt_idx}.json", "r") as reward_file:
        reward_payload = json.load(reward_file)
    rewards = np.array(reward_payload[0])
    weight = np.load(f"{data_dir}/weight.npy").squeeze()

    # Inner product of every representation with the weight vector
    projections = representations @ weight
    residuals = rewards - projections
    correlation = np.corrcoef(projections, rewards)[0, 1]

    fig, (scatter_ax, residual_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: projection against the stored reward
    scatter_ax.scatter(projections, rewards, alpha=0.5, s=30)
    scatter_ax.set_xlabel('Projection on Weight (representation · weight)', fontsize=11)
    scatter_ax.set_ylabel('Actual Reward', fontsize=11)
    scatter_ax.set_title(f'Weight Projection vs Actual Reward (Prompt {prompt_idx})', fontsize=12)
    scatter_ax.grid(True, alpha=0.3)
    scatter_ax.text(
        0.05,
        0.95,
        f'Correlation: {correlation:.4f}',
        transform=scatter_ax.transAxes,
        fontsize=10,
        verticalalignment='top',
        bbox={'boxstyle': 'round', 'facecolor': 'yellow', 'alpha': 0.5},
    )

    # Right panel: reward minus projection, with a zero reference line
    residual_ax.scatter(projections, residuals, alpha=0.5, s=30, color='red')
    residual_ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
    residual_ax.set_xlabel('Projection on Weight', fontsize=11)
    residual_ax.set_ylabel('Residual (Reward - Projection)', fontsize=11)
    residual_ax.set_title('Residual Plot', fontsize=12)
    residual_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    # Write the figure next to the other visualization outputs
    output_dir = f"{RMOOD_HOME}/rmood/distribution/visualization/outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/weight_correlation_prompt_{prompt_idx}_{model_tag}.png"
    plt.savefig(output_path, dpi=300, bbox_inches='tight')

    print(f"Figure saved to: {output_path}")
    print(f"Correlation coefficient: {correlation:.4f}")
    print(f"Mean residual: {residuals.mean():.4f} ± {residuals.std():.4f}")

    plt.show()

    return projections, correlation

# Example: projection-vs-reward analysis for the model and prompt selected earlier
projections, correlation = analyze_weight_correlation(
    reward_model_name=reward_model_name,
    prompt_idx=prompt_idx,
)

## 설명

### 주요 기능

1. **`visualize_pca_with_rewards()`**: 
   - Representation을 PCA로 2D 축소하고 reward로 colormap
   - Score layer의 weight 벡터를 빨간 화살표로 표시
   - Weight 방향은 reward가 증가하는 방향을 나타냄

2. **`plot_reward_distribution()`**: 
   - Reward 값들의 분포를 히스토그램으로 시각화

3. **`analyze_weight_correlation()`**: 
   - Weight 방향으로의 projection과 실제 reward의 상관관계 분석
   - 이론적으로는 완벽한 선형 관계여야 하나, 실제로는 bias나 비선형성 등의 영향이 있을 수 있음

### 해석

- **빨간 화살표**: Score layer가 어떤 방향을 선호하는지 표시
- **Color**: Reward 값 (노란색에 가까울수록 높은 reward)
- **화살표 방향의 점들**: 일반적으로 더 높은 reward를 받는 경향