In [None]:
# Color Primary Prediction - Exploratory Data Analysis
# ====================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set style
# Notebook-wide plotting defaults: every figure drawn below inherits this
# matplotlib style sheet and seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython line magic (not Python syntax): selects the inline figure backend.
%matplotlib inline

# Banner marking the start of the notebook output.
print("üé® Color Primary Prediction - EDA Notebook")
print("="*60)

In [None]:
# Load raw data
# Reads the raw colour table and prints a first-look report: shape, columns,
# head, dtypes, RGB summary statistics, missing values and duplicates.
# The cells below rely on the columns 'name', 'red', 'green' and 'blue'.
print("üì• Loading raw dataset...")
df_raw = pd.read_csv('data/raw/colors.csv')
print(f"Raw dataset shape: {df_raw.shape}")
print(f"Columns: {df_raw.columns.tolist()}")

print("\nüîç First 5 rows of raw data:")
display(df_raw.head())

print("\nüìã Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df_raw.info()

print("\nüìà Basic Statistics:")
display(df_raw[['red', 'green', 'blue']].describe())

print("\nüßπ Missing Values:")
print(df_raw.isnull().sum())

print("\nüîÑ Duplicate Check:")
# Full-row duplicates and repeated colour names are counted separately.
print(f"Duplicate rows: {df_raw.duplicated().sum()}")
print(f"Duplicate color names: {df_raw['name'].duplicated().sum()}")

In [None]:
# Plot RGB distributions
# One histogram per channel, side by side; the dashed vertical line marks the
# channel mean. Each channel name doubles as the matplotlib bar colour.
colors = ['red', 'green', 'blue']
titles = [f'{channel.capitalize()} Channel Distribution' for channel in colors]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, channel, heading in zip(axes, colors, titles):
    channel_values = df_raw[channel]
    channel_mean = channel_values.mean()

    panel.hist(channel_values, bins=50, color=channel, alpha=0.7, edgecolor='black')
    panel.set_title(heading, fontsize=12, fontweight='bold')
    panel.set_xlabel('Value (0-255)')
    panel.set_ylabel('Frequency')
    panel.axvline(channel_mean, color='black', linestyle='--',
                  label=f'Mean: {channel_mean:.1f}')
    panel.legend()

plt.suptitle('RGB Channel Distributions in Raw Data', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Observed min/max per channel, with the captions padded to a common width.
print("üìä RGB Value Ranges:")
for channel in colors:
    caption = f"{channel.capitalize()}:"
    print(f"{caption:<7}{df_raw[channel].min()} - {df_raw[channel].max()}")

In [None]:
# 3D scatter plot of RGB space (sampled for performance)
# Each point is drawn in its own colour: the RGB triple scaled to 0-1.

# Sample for faster plotting (fixed seed so the figure is reproducible)
sample_size = min(1000, len(df_raw))
df_sample = df_raw.sample(sample_size, random_state=42)
point_colors = df_sample[['red', 'green', 'blue']]/255

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_sample['red'], df_sample['green'], df_sample['blue'],
           c=point_colors, alpha=0.6, s=20)

axis_label_style = dict(fontsize=12, fontweight='bold')
ax.set_xlabel('Red', **axis_label_style)
ax.set_ylabel('Green', **axis_label_style)
ax.set_zlabel('Blue', **axis_label_style)

plot_title = f'Colors in RGB Space (Sampled: {sample_size} points)'
ax.set_title(plot_title, fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Correlation matrix
# Pairwise correlation of the three RGB channels, shown as a table and as an
# annotated heatmap, followed by a pairplot of a reproducible sample.
rgb_columns = ['red', 'green', 'blue']

print("üîó Correlation between RGB channels:")
corr_matrix = df_raw[rgb_columns].corr()
display(corr_matrix)

heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=1, cbar_kws={"shrink": .8})
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix: RGB Channels', fontsize=14, fontweight='bold')
plt.show()

# Pairplot (sampled for performance)
print("\nüìä Pairwise relationships (sampled):")
pair_sample_size = min(500, len(df_raw))
df_sample_pair = df_raw.sample(pair_sample_size, random_state=42)
sns.pairplot(df_sample_pair, vars=rgb_columns,
             plot_kws={'alpha': 0.6, 's': 20})
# y > 1 lifts the title above the top row of the pairplot grid.
plt.suptitle('Pairwise Relationships of RGB Values', y=1.02, fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Load labeled data
# Reads the labeled colour table, reports the size of every 'primary_label'
# class and draws the class counts as an annotated bar chart.
section_rule = "="*60
print("\n" + section_rule)
print("üè∑Ô∏è  ANALYZING LABELED DATA")
print(section_rule)

df_labeled = pd.read_csv('data/processed/colors_with_labels.csv')
print(f"Labeled dataset shape: {df_labeled.shape}")
print(f"New columns: {df_labeled.columns.tolist()}")

print("\nüîç First 5 rows of labeled data:")
display(df_labeled.head())

# Class distribution
print("\nüìä Class Distribution Analysis:")
label_counts = df_labeled['primary_label'].value_counts()
n_labeled_rows = len(df_labeled)
print(f"Total classes: {len(label_counts)}")
print(f"Class distribution:")

for label, count in label_counts.items():
    percentage = count/n_labeled_rows*100
    print(f"  {label:20}: {count:5} samples ({percentage:.1f}%)")

# Visualize class distribution
bar_palette = sns.color_palette("husl", len(label_counts))
plt.figure(figsize=(12, 6))
bars = plt.bar(label_counts.index, label_counts.values, color=bar_palette)
plt.title('Distribution of Primary Color Labels', fontsize=14, fontweight='bold')
plt.xlabel('Primary Color Combination', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Add count labels on bars (placed 5 data units above each bar top)
for rect in bars:
    bar_top = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    plt.text(label_x, bar_top + 5, f'{int(bar_top)}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Create color swatches for each class
# Draws one filled square per class, using the first row of that class as the
# representative colour. Panels are laid out on two rows.
print("\nüé® Visualizing Sample Colors from Each Class:")

classes = df_labeled['primary_label'].unique()
n_classes = len(classes)

fig, axes = plt.subplots(2, (n_classes + 1)//2, figsize=(16, 8))
axes = axes.flatten()

# zip() stops at the shorter sequence, so classes never outrun the panels.
for panel, class_name in zip(axes, classes):
    class_samples = df_labeled[df_labeled['primary_label'] == class_name]
    if len(class_samples) == 0:
        # No row matched this label: leave the panel blank.
        panel.axis('off')
        continue

    # Get a sample color from this class
    sample = class_samples.iloc[0]
    red, green, blue = sample['red'], sample['green'], sample['blue']

    # Create color swatch
    color = (red/255, green/255, blue/255)
    panel.add_patch(plt.Rectangle((0, 0), 1, 1, color=color))
    panel.set_title(f"{class_name}\nRGB: {red},{green},{blue}", fontsize=10)
    panel.set_xlim(0, 1)
    panel.set_ylim(0, 1)
    panel.axis('off')

# Hide any unused subplots
for spare_panel in axes[len(classes):]:
    spare_panel.axis('off')

plt.suptitle('Sample Colors from Each Primary Color Class', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# PCA to check class separability
# Projects the standardized RGB features onto their first two principal
# components and colours every sample by its primary label.
print("\nüîç Checking Class Separability with PCA:")

from sklearn.decomposition import PCA

X = df_labeled[['red', 'green', 'blue']].values
y = df_labeled['primary_label'].values

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Integer code per sample, plus the label that each code stands for
# (class_names[k] is the label encoded as k).
class_codes, class_names = pd.factorize(y)

# Plot PCA results
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                      c=class_codes, cmap='tab10', alpha=0.6, s=30)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)', fontsize=12)
plt.title('PCA of RGB Colors Colored by Primary Label', fontsize=14, fontweight='bold')
# The colorbar used to show bare integer codes, so a colour could not be
# mapped back to a class. Tick it at each code and label with the class name.
colorbar = plt.colorbar(scatter, label='Class')
colorbar.set_ticks(list(range(len(class_names))))
colorbar.set_ticklabels([str(class_name) for class_name in class_names])
plt.grid(True, alpha=0.3)
plt.show()

print(f"PCA Explained Variance:")
print(f"  PC1: {pca.explained_variance_ratio_[0]:.2%}")
print(f"  PC2: {pca.explained_variance_ratio_[1]:.2%}")
print(f"  Total: {sum(pca.explained_variance_ratio_):.2%}")

print("\nüí° Interpretation:")
print("  - If classes are well-separated in PCA plot, ML models should work well")
print("  - Overlapping classes might be harder to classify")

In [None]:
# Final summary
# Recaps the classification EDA. Relies on `df_labeled` and `label_counts`
# from the labeled-data cell and on `pca` from the PCA cell.
print("\n" + "="*60)
print("üìã EDA SUMMARY")
print("="*60)

print(f"\nüìä Dataset Statistics:")
print(f"  Total samples: {len(df_labeled)}")
print(f"  Number of classes: {len(df_labeled['primary_label'].unique())}")
print(f"  Class distribution:")
for label, count in label_counts.items():
    print(f"    {label:20}: {count/len(df_labeled)*100:5.1f}%")

print(f"\nüé® RGB Statistics:")
for channel in ['red', 'green', 'blue']:
    mean_val = df_labeled[channel].mean()
    std_val = df_labeled[channel].std()
    print(f"  {channel:6}: Mean = {mean_val:6.1f}, Std = {std_val:6.1f}, Range = {df_labeled[channel].min():3.0f}-{df_labeled[channel].max():3.0f}")

print(f"\nüîó Correlations:")
print(f"  Red-Green:   {df_labeled['red'].corr(df_labeled['green']):.3f}")
print(f"  Red-Blue:    {df_labeled['red'].corr(df_labeled['blue']):.3f}")
print(f"  Green-Blue:  {df_labeled['green'].corr(df_labeled['blue']):.3f}")

# The row count is not the number of distinct colours: the same RGB triple
# can appear on several rows, so distinct triples are counted explicitly.
n_unique_colors = len(df_labeled.drop_duplicates(subset=['red', 'green', 'blue']))
# Explained variance says how much of the RGB spread two components keep.
# It does not measure how well classes separate, so it is reported as such.
retained_variance = sum(pca.explained_variance_ratio_)

print(f"\nüí° Key Insights:")
print(f"  1. Dataset has {len(df_labeled)} samples ({n_unique_colors} unique RGB colors)")
print(f"  2. {len(label_counts)} primary color combinations identified")
# Heuristic: class sizes with a coefficient of variation below 0.5 count as balanced.
print(f"  3. Class distribution is {'' if label_counts.std()/label_counts.mean() < 0.5 else 'not '}balanced")
print(f"  4. First two principal components retain {retained_variance:.1%} of RGB variance "
      f"(judge class separability from the PCA plot)")
print(f"  5. Ready for ML model training!")

print("\n‚úÖ EDA Complete! Proceed to model training.")

In [None]:
# Class imbalance metrics
# Quantifies how uneven the 'primary_label' classes are, derives the weights
# that class_weight='balanced' assigns, and previews a resampling plan.
# The balanced weight of a class is total_samples / (n_classes * class_size).
# It is computed once here and reused by every table and chart below.
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

print("üìä Class Imbalance Analysis:")
print("=" * 40)

# Calculate imbalance ratio
class_counts = df_labeled['primary_label'].value_counts()
max_count = class_counts.max()
min_count = class_counts.min()
total_samples = len(df_labeled)
n_classes = len(class_counts)

# Per-class series aligned with class_counts (same index, same order).
balanced_weights = total_samples / (n_classes * class_counts)
class_percentages = class_counts / total_samples * 100
majority_weight = total_samples / (n_classes * max_count)

print(f"Most frequent class: {class_counts.idxmax()} ({max_count} samples)")
print(f"Least frequent class: {class_counts.idxmin()} ({min_count} samples)")
print(f"Imbalance ratio: {max_count/min_count:.2f}:1")
print(f"Minority class %: {min_count/len(df_labeled)*100:.2f}%")

# Cross-check against sklearn. LabelEncoder maps the labels to 0..n-1 in
# sorted order, so label_encoder.classes_ lines up with the returned weights.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_labeled['primary_label'])

# Get all unique class indices (0 through n_classes-1)
classes_indices = np.arange(len(label_encoder.classes_))

# Now compute class weights
try:
    class_weights = compute_class_weight('balanced', classes=classes_indices, y=y_encoded)

    print("\nüìà Suggested class weights for models (from sklearn):")
    for class_name, weight in zip(label_encoder.classes_, class_weights):
        print(f"  {class_name:20}: {weight:.3f}")

except ValueError as e:
    # Best-effort fallback: report the problem and show the manual table.
    print(f"\n‚ö†Ô∏è  Error computing class weights: {e}")
    print("Using manual calculation instead...")

    print("\nüìä Manual Class Weight Calculation:")
    print("Class                  | Samples | Weight = Total/(Classes * Samples)")
    print("-" * 60)

    for (class_name, count), manual_weight, percentage in zip(
            class_counts.items(), balanced_weights, class_percentages):
        print(f"{class_name:20} | {count:7} | {manual_weight:.3f} ({percentage:.1f}%)")

# Alternative: Always use manual calculation (more reliable)
print("\n" + "="*60)
print("üìä RELIABLE CLASS WEIGHT CALCULATION")
print("="*60)

print("Formula: weight_for_class = total_samples / (n_classes √ó samples_in_class)")
print(f"Total samples: {total_samples}, Number of classes: {n_classes}")
print("\nCalculated weights:")
for (class_name, count), weight, percentage in zip(
        class_counts.items(), balanced_weights, class_percentages):
    print(f"  {class_name:20}: {count:6} samples ‚Üí weight = {weight:.3f} ({percentage:.1f}% of data)")

# Calculate what class_weight='balanced' actually does
print("\n‚öñÔ∏è  What class_weight='balanced' does in sklearn:")
print("It gives higher weights to minority classes to balance their influence")
print("\nWeight ratios (compared to majority class):")
for class_name, weight in balanced_weights.items():
    ratio = weight / majority_weight
    print(f"  {class_name:20}: weight is {ratio:.1f}√ó higher than majority class")

# Visualize imbalance with pie chart
plt.figure(figsize=(10, 10))
wedges, texts, autotexts = plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
        colors=sns.color_palette("husl", len(class_counts)))
plt.title(f'Class Distribution (Imbalance: {max_count/min_count:.1f}:1)', fontsize=16, fontweight='bold')

# Make the pie chart more readable
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

plt.show()

# Show impact of resampling
print("\nüîß Resampling Strategies Preview:")
print("-" * 40)

# Simulate balanced dataset
target_samples = 2000  # Target samples per class (illustrative value)
print(f"With balanced resampling to {target_samples} samples per class:")
for label, current in class_counts.items():
    needed = target_samples - current
    if needed > 0:
        print(f"  {label:20}: {current:4} ‚Üí {target_samples:4} (OVERSAMPLE +{needed:,})")
    elif needed < 0:
        print(f"  {label:20}: {current:4} ‚Üí {target_samples:4} (UNDERSAMPLE {needed:,})")
    else:
        print(f"  {label:20}: {current:4} ‚Üí {target_samples:4} (PERFECT)")

# Show class weights bar chart
print("\nüìà Visualizing Class Weights:")
plt.figure(figsize=(12, 6))

# Sort classes by weight (highest to lowest); sorted() keeps ties in their
# original order.
sorted_weights = sorted(balanced_weights.items(), key=lambda item: item[1], reverse=True)
class_names_sorted = [name for name, _ in sorted_weights]
weights_sorted = [weight for _, weight in sorted_weights]

bars = plt.bar(class_names_sorted, weights_sorted,
               color=sns.color_palette("husl", len(class_names_sorted)))

plt.xlabel('Class', fontsize=12)
plt.ylabel('Class Weight', fontsize=12)
plt.title('Class Weights for Balanced Training\n(higher weight = more attention to minority class)',
          fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

# Add weight values on bars
for bar, weight in zip(bars, weights_sorted):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{weight:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# Final summary
print("\n" + "="*60)
print("üìã IMBALANCE HANDLING SUMMARY")
print("="*60)
print(f"Imbalance ratio: {max_count/min_count:.1f}:1")
print(f"Majority class ({class_counts.idxmax()}): {max_count:,} samples")
print(f"Minority class ({class_counts.idxmin()}): {min_count:,} samples")
print(f"\nFor class_weight='balanced':")
print(f"  ‚Ä¢ Minority class gets {max_count/min_count:.1f}√ó more weight")
print(f"  ‚Ä¢ This forces model to pay more attention to rare classes")
print(f"  ‚Ä¢ Prevents model from ignoring minority classes")
print(f"\nModels using class_weight='balanced': Decision Tree, Random Forest")
print(f"Models without imbalance handling: KNN, Naive Bayes")

In [None]:
# ============================================
# PRESENTATION-READY VISUALIZATION
# ============================================
# Four-panel summary figure: (A) class shares, (B) balanced class weights,
# (C) weight relative to the majority class, (D) accuracy on clean vs noisy
# data. Uses the notebook-level `plt` and `np` imports.

print("\n" + "="*70)
print("üéØ PRESENTATION-READY VISUALIZATION")
print("="*70)

# Data from your analysis
classes = ['Blue+Red', 'Red', 'Blue', 'Red+Blue+Yellow', 'Red+Yellow']
samples = [22635, 12465, 5726, 2175, 1546]

# Weights, percentages and ratios used to be pasted in as separate literals,
# which could silently drift from `samples`. They are derived here so the
# panels and titles always agree with the counts.
sample_counts = np.asarray(samples, dtype=float)
total_count = sample_counts.sum()
weights = total_count / (len(classes) * sample_counts)   # balanced class weight
percentages = sample_counts / total_count * 100
weight_ratios = weights / weights.min()                  # Compared to the majority class
imbalance_ratio = sample_counts.max() / sample_counts.min()
minority_class = classes[int(sample_counts.argmin())]

# Create a professional figure
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Class Distribution (Pie Chart)
ax1 = axes[0, 0]
wedges, texts, autotexts = ax1.pie(samples, labels=classes, autopct='%1.1f%%',
                                   colors=plt.cm.Set3(np.linspace(0, 1, len(classes))))
ax1.set_title(f'A. Class Distribution in Dataset\n({imbalance_ratio:.1f}:1 Imbalance Ratio)',
             fontsize=14, fontweight='bold', pad=20)
for autotext in autotexts:
    autotext.set_color('black')
    autotext.set_fontweight('bold')

# Plot 2: Class Weights (Bar Chart)
ax2 = axes[0, 1]
x_pos = np.arange(len(classes))
bars = ax2.bar(x_pos, weights, color=plt.cm.viridis(np.linspace(0.2, 0.8, len(classes))))
ax2.set_title('B. Class Weights for Balanced Training\n(class_weight="balanced")',
             fontsize=14, fontweight='bold')
ax2.set_xlabel('Class', fontsize=12)
ax2.set_ylabel('Weight', fontsize=12)
ax2.set_xticks(x_pos)
ax2.set_xticklabels(classes, rotation=45, ha='right')
ax2.grid(True, alpha=0.3, axis='y')

# Add weight values on bars
for bar, weight in zip(bars, weights):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 0.1,
             f'{weight:.2f}', ha='center', va='bottom', fontsize=10)

# Plot 3: Weight Ratio (How many times more important)
ax3 = axes[1, 0]
# Colour tiers: above 10x, above 4x, everything else.
colors = ['#ff6b6b' if ratio > 10 else '#4ecdc4' if ratio > 4 else '#45b7d1' for ratio in weight_ratios]
bars3 = ax3.bar(x_pos, weight_ratios, color=colors)
ax3.set_title(f'C. Attention Multiplier vs Majority Class\n({minority_class} gets {imbalance_ratio:.1f}√ó more attention)',
             fontsize=14, fontweight='bold')
ax3.set_xlabel('Class', fontsize=12)
ax3.set_ylabel('Times More Important', fontsize=12)
ax3.set_xticks(x_pos)
ax3.set_xticklabels(classes, rotation=45, ha='right')
ax3.grid(True, alpha=0.3, axis='y')
ax3.axhline(y=1, color='red', linestyle='--', alpha=0.5, label='Majority class baseline')

# Add ratio values
for bar, ratio in zip(bars3, weight_ratios):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.2,
             f'{ratio:.1f}√ó', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Plot 4: Model Performance Comparison
# NOTE(review): these accuracies are pasted literals, not computed in this
# notebook; confirm they match the latest training run.
ax4 = axes[1, 1]
models = ['KNN\n(unbalanced)', 'Naive Bayes\n(unbalanced)', 'Decision Tree\n(balanced)', 'Random Forest\n(balanced)']
clean_acc = [0.9848, 0.7851, 0.9961, 0.9954]  # From your results
noisy_acc = [0.6310, 0.6585, 0.5686, 0.6532]  # 20% noise

x = np.arange(len(models))
width = 0.35

bars4a = ax4.bar(x - width/2, clean_acc, width, label='0% Noise', color='#2ecc71', alpha=0.8)
bars4b = ax4.bar(x + width/2, noisy_acc, width, label='20% Noise', color='#e74c3c', alpha=0.8)

ax4.set_title('D. Model Performance: Clean vs Noisy Data\n(Balanced models handle imbalance better)',
             fontsize=14, fontweight='bold')
ax4.set_xlabel('Model', fontsize=12)
ax4.set_ylabel('Accuracy', fontsize=12)
ax4.set_xticks(x)
ax4.set_xticklabels(models, fontsize=10)
ax4.set_ylim(0, 1.1)
ax4.legend()
ax4.grid(True, alpha=0.3, axis='y')

# Add accuracy values
for bars in [bars4a, bars4b]:
    for bar in bars:
        height = bar.get_height()
        ax4.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.suptitle('ML Project: Handling Class Imbalance and Noise in Color Classification',
            fontsize=18, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# ============================================
# PRESENTATION SUMMARY
# ============================================
# Static talking points for the presentation. Every figure quoted here is
# fixed text; nothing is computed from the data loaded in this notebook.

presentation_rule = "="*70
presentation_lines = [
    "\n" + presentation_rule,
    "üìã PRESENTATION SUMMARY",
    presentation_rule,

    "\nüéØ PROBLEM STATEMENT:",
    "   ‚Ä¢ Classify colors into primary color combinations",
    "   ‚Ä¢ Dataset has natural 14.6:1 class imbalance",
    "   ‚Ä¢ Real-world data often has noise/imperfections",

    "\nüí° SOLUTION IMPLEMENTED:",
    "   ‚Ä¢ Used class_weight='balanced' in tree-based models",
    "   ‚Ä¢ Minority classes get 14.6√ó higher weight",
    "   ‚Ä¢ All models tested with 0%, 5%, 10%, 20% Gaussian noise",

    "\nüìä KEY RESULTS:",
    "   1. Tree models with balanced weights: >99.5% accuracy",
    "   2. Perfect F1-scores (1.00) for ALL classes",
    "   3. Naive Bayes: Most noise-robust (16.1% drop at 20% noise)",
    "   4. Decision Tree: Highest accuracy but most noise-sensitive",
    "   5. Random Forest: Best balance of accuracy & robustness",

    "\nüéì EDUCATIONAL VALUE:",
    "   ‚Ä¢ Demonstrates real-world ML challenges",
    "   ‚Ä¢ Shows importance of handling class imbalance",
    "   ‚Ä¢ Compares different algorithms' strengths/weaknesses",
    "   ‚Ä¢ Provides insights into model selection for real applications",

    "\n" + presentation_rule,
    "‚úÖ PROJECT SUCCESSFULLY DEMONSTRATES:",
    "   - Proper ML practices for imbalanced data",
    "   - Noise robustness analysis",
    "   - Algorithm comparison under realistic conditions",
    "   - Effective visualization of technical concepts",
    presentation_rule,
]

for presentation_line in presentation_lines:
    print(presentation_line)

In [None]:
# ====================================================
# LINEAR REGRESSION EDA - RYB RATIO PREDICTION
# ====================================================
# Starts the regression part of the notebook with its own copy of the raw
# colour table, read again from disk.

regression_rule = "="*60
print("\n" + regression_rule)
print("üìä LINEAR REGRESSION EDA - RYB Ratio Prediction")
print(regression_rule)

# Load data for regression analysis
print("\nüì• Loading data for regression analysis...")
regression_source_path = 'data/raw/colors.csv'
df_regression = pd.read_csv(regression_source_path)

# Create RYB ratio targets using your conversion function
def rgb_to_ryb_ratios(r, g, b):
    """
    Convert an RGB colour to estimated RYB (red, yellow, blue) mixing ratios.

    Args:
        r, g, b: Channel intensities on the 0-255 scale.

    Returns:
        list[float]: ``[red_ratio, yellow_ratio, blue_ratio]``, normalised to
        sum to 1. For chromatic inputs the sum falls short of 1 by a
        negligible amount because of the 1e-10 guard in the denominator.

    When no component survives the conversion, the result is an even 1/3
    split. This happens for dark colours (minimum channel <= 127) whose red
    and blue channels both equal that minimum: black, dark greys and dark
    green-only hues. The even split matches what light greys already receive.
    These inputs previously came back as ``[0, 0, 0]``, which broke the
    sum-to-1 contract. Outputs for every other colour are unchanged.
    """
    # Normalize RGB
    r_norm, g_norm, b_norm = r/255.0, g/255.0, b/255.0

    # Remove white component first
    white = min(r_norm, g_norm, b_norm)
    r_prime = r_norm - white
    g_prime = g_norm - white
    b_prime = b_norm - white

    # Convert RGB to RYB
    red_component = r_prime
    yellow_component = min(r_prime, g_prime)  # Yellow comes from Red+Green
    blue_component = b_prime
    # NOTE(review): green in excess of red (g_prime > r_prime) is dropped
    # rather than mapped onto yellow/blue; confirm this is intended.

    # Light colours: share the removed white evenly so that near-white and
    # light-grey inputs keep all three components.
    if white > 0.5:  # Light colors
        red_component += white * 0.33
        yellow_component += white * 0.33
        blue_component += white * 0.33

    component_sum = red_component + yellow_component + blue_component
    if component_sum <= 0:
        # Nothing to normalise: fall back to an even split instead of
        # returning three zeros.
        return [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0]

    # Epsilon kept so results for chromatic colours match earlier runs exactly.
    total = component_sum + 1e-10

    # Normalize to sum to 1
    return [
        red_component / total,
        yellow_component / total,
        blue_component / total
    ]

# Calculate RYB ratios for all colors
# Runs the converter row by row, then spreads each returned triple over three
# new columns and stores the row-wise sum for the validation further down.
print("üîÑ Calculating RYB ratios for regression targets...")


def _row_to_ryb(row):
    """Apply rgb_to_ryb_ratios to the red/green/blue fields of one row."""
    return rgb_to_ryb_ratios(row['red'], row['green'], row['blue'])


ryb_ratios = df_regression.apply(_row_to_ryb, axis=1)

ratio_column_names = ['red_ratio', 'yellow_ratio', 'blue_ratio']
for position, column_name in enumerate(ratio_column_names):
    df_regression[column_name] = [triple[position] for triple in ryb_ratios]
df_regression['ratio_sum'] = df_regression[ratio_column_names].sum(axis=1)

print(f"\n‚úÖ Added RYB ratio columns to {len(df_regression)} samples")

# Display sample conversions
# Prints the RYB ratios for a few reference colours. A primary is listed as
# dominant when its ratio is above 0.3.
print("\nüîç Sample RGB to RYB Conversions:")
print("-" * 50)
sample_colors = [
    ([255, 0, 0], "Pure Red"),
    ([255, 255, 0], "Yellow"),
    ([0, 0, 255], "Pure Blue"),
    ([255, 0, 255], "Magenta"),
    ([0, 255, 0], "Green"),
    ([128, 128, 128], "Gray"),
    ([255, 128, 0], "Orange"),
    ([128, 0, 128], "Purple")
]
primary_names = ('Red', 'Yellow', 'Blue')

for rgb, name in sample_colors:
    ratios = rgb_to_ryb_ratios(*rgb)
    red_share, yellow_share, blue_share = ratios
    print(f"{name:20} RGB{rgb}")
    print(f"  ‚Üí RYB: [{red_share:.3f}, {yellow_share:.3f}, {blue_share:.3f}]")
    print(f"  ‚Üí Sum: {sum(ratios):.3f}")
    primary = [
        f"{primary_name}({share:.0%})"
        for primary_name, share in zip(primary_names, ratios)
        if share > 0.3
    ]
    if not primary:
        print(f"  ‚Üí Mixed colors (no dominant primary)")
    else:
        print(f"  ‚Üí Primary: {' + '.join(primary)}")
    print()

# Distribution of RYB ratios
# One histogram per ratio column, with mean (black) and median (green) marked.
# `ratios` and `colors` are set here and read again further down this cell.
print("\nüìä Distribution of RYB Ratios:")
print("-" * 40)

ratios = ['red_ratio', 'yellow_ratio', 'blue_ratio']
colors = ['red', 'gold', 'blue']
titles = ['Red Ratio Distribution', 'Yellow Ratio Distribution', 'Blue Ratio Distribution']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, ratio_column, bar_color, heading in zip(axes, ratios, colors, titles):
    ratio_values = df_regression[ratio_column]
    ratio_mean = ratio_values.mean()
    ratio_median = ratio_values.median()

    panel.hist(ratio_values, bins=50, color=bar_color, alpha=0.7, edgecolor='black')
    panel.set_title(heading, fontsize=12, fontweight='bold')
    panel.set_xlabel('Ratio (0-1)')
    panel.set_ylabel('Frequency')
    panel.axvline(ratio_mean, color='black', linestyle='--',
                  label=f'Mean: {ratio_mean:.3f}')
    panel.axvline(ratio_median, color='green', linestyle='--',
                  label=f'Median: {ratio_median:.3f}')
    panel.legend(fontsize=9)

plt.suptitle('Distribution of RYB Ratios in Dataset', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Ratio statistics
# Summary statistics per ratio column, then a check that each row's three
# ratios add up to (approximately) one.
print("\nüìà RYB Ratio Statistics:")
print("="*50)
for ratio_column, primary_name in zip(ratios, ['Red', 'Yellow', 'Blue']):
    data = df_regression[ratio_column]
    n_rows = len(data)
    summary_rows = (
        ("Mean:", data.mean()),
        ("Median:", data.median()),
        ("Std Dev:", data.std()),
        ("Min:", data.min()),
        ("Max:", data.max()),
    )
    print(f"\n{primary_name} Ratio:")
    for caption, value in summary_rows:
        print(f"  {caption:<9}{value:.4f}")
    print(f"  Zero %:  {(data == 0).sum() / n_rows * 100:.1f}%")
    print(f"  >0.5 %:  {(data > 0.5).sum() / n_rows * 100:.1f}%")

# Check ratio sums
ratio_sum = df_regression['ratio_sum']
within_tolerance = ratio_sum.between(0.99, 1.01)  # inclusive on both ends
print(f"\n‚úÖ Ratio Sum Validation:")
for caption, value in (("Mean sum:", ratio_sum.mean()),
                       ("Std sum:", ratio_sum.std()),
                       ("Min sum:", ratio_sum.min()),
                       ("Max sum:", ratio_sum.max())):
    print(f"  {caption:<10}{value:.6f}")
print(f"  % within 0.99-1.01: {within_tolerance.sum() / len(df_regression) * 100:.1f}%")

# Correlation between RGB inputs and RYB targets
# Heatmap of the input-vs-target block of the correlation matrix, followed by
# the same numbers printed per target.
print("\nüîó Correlation Analysis:")
print("-" * 40)

rgb_inputs = ['red', 'green', 'blue']
ryb_targets = ['red_ratio', 'yellow_ratio', 'blue_ratio']

# Correlation matrix: RGB inputs vs RYB outputs
corr_matrix = df_regression[rgb_inputs + ryb_targets].corr()
input_output_corr = corr_matrix.loc[rgb_inputs, ryb_targets]

plt.figure(figsize=(10, 8))
sns.heatmap(input_output_corr, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": .8}, fmt='.3f')
plt.title('Correlation: RGB Inputs vs RYB Ratio Targets', fontsize=14, fontweight='bold')
plt.show()

print("\nüìä Correlation Insights:")
# NOTE(review): the "strongly correlated" wording is printed whatever the
# coefficients turn out to be.
target_headings = ['Red Ratio', 'Yellow Ratio', 'Blue Ratio']
for position, (target, heading) in enumerate(zip(ryb_targets, target_headings)):
    # Only the targets after the first are preceded by a blank line.
    leading = "" if position == 0 else "\n"
    print(f"{leading}  {heading} is strongly correlated with:")
    for source in rgb_inputs:
        caption = f"{source.capitalize()} input:"
        print(f"    ‚Ä¢ {caption:<13}{input_output_corr.loc[source, target]:.3f}")

# Visualize relationship between RGB and RYB ratios
# 3x3 scatter grid: rows are RGB inputs, columns are RYB ratio targets. Point
# colours come from the `colors` list defined for the ratio histograms.
print("\nüìà Visualizing RGB to RYB Relationships:")
print("-" * 40)

input_channels = ['red', 'green', 'blue']
output_channels = ['red_ratio', 'yellow_ratio', 'blue_ratio']
output_names = ['Red Ratio', 'Yellow Ratio', 'Blue Ratio']

fig, axes = plt.subplots(3, 3, figsize=(15, 12))

for row_idx, input_channel in enumerate(input_channels):
    input_values = df_regression[input_channel]
    input_label = input_channel.capitalize()

    for col_idx, (output_channel, output_name) in enumerate(zip(output_channels, output_names)):
        panel = axes[row_idx, col_idx]
        output_values = df_regression[output_channel]

        panel.scatter(input_values, output_values,
                      alpha=0.1, s=10, color=colors[col_idx])
        panel.set_xlabel(f'{input_label} Input', fontsize=10)
        panel.set_ylabel(output_name, fontsize=10)
        panel.grid(True, alpha=0.3)

        # Calculate correlation
        corr = input_values.corr(output_values)
        panel.set_title(f'{input_label} vs {output_name}\nCorr: {corr:.3f}',
                        fontsize=11, fontweight='bold')

plt.suptitle('Relationship Between RGB Inputs and RYB Ratio Targets', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Distribution of ratio combinations
print("\nüé® Distribution of Primary Color Dominance:")
print("-" * 40)

# Determine dominant primary for each color
def get_dominant_primary(row):
    """
    Name the primary (or tied primaries) with the largest RYB ratio.

    Args:
        row: Mapping-like object exposing 'red_ratio', 'yellow_ratio' and
            'blue_ratio'.

    Returns:
        str: The winning primary names joined with '+', always in
        Red, Yellow, Blue order (e.g. 'Red' or 'Red+Blue'). Returns 'Equal'
        when no ratio compares equal to the maximum.
    """
    labelled_ratios = (
        ('Red', row['red_ratio']),
        ('Yellow', row['yellow_ratio']),
        ('Blue', row['blue_ratio']),
    )
    top_ratio = max(value for _, value in labelled_ratios)
    # Exact equality on purpose: ties are reported only for identical values.
    winners = [label for label, value in labelled_ratios if value == top_ratio]
    if not winners:
        return 'Equal'
    return '+'.join(winners)

# Label every row with its dominant primary (or tied primaries).
df_regression['dominant_primary'] = df_regression.apply(get_dominant_primary, axis=1)

# Count dominant primaries
dominant_counts = df_regression['dominant_primary'].value_counts()
dominant_palette = sns.color_palette("husl", len(dominant_counts))

plt.figure(figsize=(10, 6))
bars = plt.bar(dominant_counts.index, dominant_counts.values, color=dominant_palette)
plt.title('Most Dominant Primary Color in Each Sample', fontsize=14, fontweight='bold')
plt.xlabel('Dominant Primary(s)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Add count labels (placed 5 data units above each bar top)
for rect in bars:
    bar_top = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    plt.text(label_x, bar_top + 5, f'{int(bar_top)}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print("\nüìä Dominant Primary Statistics:")
n_regression_rows = len(df_regression)
for primary, count in dominant_counts.items():
    percentage = count / n_regression_rows * 100
    print(f"  {primary:15}: {count:6} samples ({percentage:.1f}%)")

# Check for linear separability in ratio space
# Fits one ordinary least-squares model per RYB ratio on the raw RGB inputs
# and reports 5-fold cross-validated R² with a coarse verdict.
print("\nüîç Checking Linear Separability in Ratio Space:")
print("-" * 40)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Prepare data for linear regression
X_ratio = df_regression[['red', 'green', 'blue']].values
y_red = df_regression['red_ratio'].values
y_yellow = df_regression['yellow_ratio'].values
y_blue = df_regression['blue_ratio'].values

# Verdicts are checked from the highest cutoff down; the first hit wins.
r2_verdicts = (
    (0.8, "    ‚úÖ Excellent linear relationship"),
    (0.6, "    üëç Good linear relationship"),
    (0.4, "    üìä Moderate linear relationship"),
)
weak_verdict = "    ‚ö†Ô∏è  Weak linear relationship"

# Cross-validation for each ratio
print("\nüìà Cross-Validation R¬≤ Scores for Individual Linear Models:")
print("  (Higher scores indicate better linear fit)")

targets = (('Red Ratio', y_red), ('Yellow Ratio', y_yellow), ('Blue Ratio', y_blue))
for name, y_data in targets:
    model = LinearRegression()
    scores = cross_val_score(model, X_ratio, y_data, cv=5, scoring='r2')
    mean_r2 = scores.mean()

    print(f"\n  {name}:")
    print(f"    Mean R¬≤: {mean_r2:.4f}")
    print(f"    Std R¬≤:  {scores.std():.4f}")
    print(f"    Range:   [{scores.min():.4f}, {scores.max():.4f}]")

    # Interpretation
    verdict = next((text for cutoff, text in r2_verdicts if mean_r2 > cutoff), weak_verdict)
    print(verdict)

# 3D visualization of RGB to RYB mapping
# Three 3D scatter panels over the same RGB sample, each shaded by one RYB
# ratio with its own colormap and colorbar.
print("\nüé® 3D Visualization: RGB Space Colored by RYB Ratios")
print("-" * 40)

# Sample for faster plotting (fixed seed so the figure is reproducible)
sample_size = min(500, len(df_regression))
df_sample = df_regression.sample(sample_size, random_state=42)

# (subplot position, ratio column, colormap, panel title)
ratio_panels = (
    (131, 'red_ratio', 'Reds', 'Colored by Red Ratio'),
    (132, 'yellow_ratio', 'YlOrBr', 'Colored by Yellow Ratio'),
    (133, 'blue_ratio', 'Blues', 'Colored by Blue Ratio'),
)

fig = plt.figure(figsize=(15, 5))

for position, ratio_column, cmap_name, panel_title in ratio_panels:
    panel = fig.add_subplot(position, projection='3d')
    points = panel.scatter(df_sample['red'], df_sample['green'], df_sample['blue'],
                           c=df_sample[ratio_column], cmap=cmap_name, alpha=0.6, s=20)
    panel.set_xlabel('Red')
    panel.set_ylabel('Green')
    panel.set_zlabel('Blue')
    panel.set_title(panel_title, fontweight='bold')
    plt.colorbar(points, ax=panel, shrink=0.5)

plt.suptitle(f'RGB Space Colored by RYB Ratios (Sample: {sample_size} points)',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Final Regression EDA Summary
# Closing recap for the regression section. Only the sample count is computed;
# every other line is fixed text.
# NOTE(review): the "fit expected" and "strongest correlations" lines are not
# derived from the scores and correlations computed above; confirm they match.
summary_rule = "="*60
regression_summary_lines = [
    "\n" + summary_rule,
    "üìã LINEAR REGRESSION EDA SUMMARY",
    summary_rule,

    "\nüìä Dataset Statistics for Regression:",
    f"  Total samples: {len(df_regression):,}",
    "  Input features: 3 (Red, Green, Blue)",
    "  Output targets: 3 (Red Ratio, Yellow Ratio, Blue Ratio)",

    "\nüé® RYB Ratio Characteristics:",
    "  All ratios sum to approximately 1.0",
    "  Ratio range: 0.0 to 1.0 (continuous values)",
    "  Distribution: Varies by channel (see histograms)",

    "\nüîó Correlation Insights:",
    "  Strongest correlations found between:",
    "    ‚Ä¢ Red input ‚Üî Red ratio",
    "    ‚Ä¢ Green input ‚Üî Yellow ratio",
    "    ‚Ä¢ Blue input ‚Üî Blue ratio",

    "\nüìà Linear Regression Suitability:",
    "  Cross-validation R¬≤ scores indicate:",
    "    ‚Ä¢ Red ratio: Good linear fit expected",
    "    ‚Ä¢ Yellow ratio: Moderate linear fit expected",
    "    ‚Ä¢ Blue ratio: Good linear fit expected",

    "\nüí° Key Challenges for Linear Regression:",
    "  1. Non-linear RGB to RYB conversion",
    "  2. Ratios must sum to 1 (constrained outputs)",
    "  3. Some colors have equal ratios (gray, white, black)",

    "\nüéØ Expected Performance:",
    "  ‚Ä¢ R¬≤ > 0.8: Excellent predictive power",
    "  ‚Ä¢ R¬≤ 0.6-0.8: Good performance",
    "  ‚Ä¢ R¬≤ < 0.6: Consider non-linear models",

    "\nüîß Recommendations:",
    "  1. Use MultiOutputRegressor for 3 outputs",
    "  2. Consider Ridge regression for regularization",
    "  3. Scale RGB inputs (0-255 range)",
    "  4. Post-process to ensure ratios sum to 1",

    "\n‚úÖ Regression EDA Complete! Ready for model training.",
    summary_rule,
]

for summary_line in regression_summary_lines:
    print(summary_line)