In [None]:
# Color Primary Prediction - Exploratory Data Analysis
# ====================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Notebook banner: title line followed by a 60-character rule.
print("üé® Color Primary Prediction - EDA Notebook", "=" * 60, sep="\n")

In [None]:
# Load the raw colour table and run first-look sanity checks:
# shape/columns, a preview, dtypes, summary statistics, missing values, duplicates.
print("üì• Loading raw dataset...")
df_raw = pd.read_csv('data/raw/colors.csv')
print(f"Raw dataset shape: {df_raw.shape}")
print(f"Columns: {df_raw.columns.tolist()}")

print("\nüîç First 5 rows of raw data:")
display(df_raw.head())

print("\nüìã Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() appended a stray "None" line to the output.
df_raw.info()

print("\nüìà Basic Statistics:")
# In the describe() table, the "std" row is the standard deviation.
display(df_raw[['red', 'green', 'blue']].describe())

print("\nüßπ Missing Values:")
print(df_raw.isnull().sum())

print("\nüîÑ Duplicate Check:")
# Fully identical rows vs. rows that merely share a colour name.
print(f"Duplicate rows: {df_raw.duplicated().sum()}")
print(f"Duplicate color names: {df_raw['name'].duplicated().sum()}")

In [None]:
# Pairwise correlation between the three colour channels, drawn as an annotated heatmap.
corr_matrix = df_raw.loc[:, ['red', 'green', 'blue']].corr()
# sns.heatmap returns the Axes it drew on, so the title can be set on it directly.
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True).set_title('RGB Correlation Matrix')
plt.show()

In [None]:
# Load the labeled dataset and inspect how samples are spread across classes.
df_labeled = pd.read_csv('data/processed/colors_clean.csv')
print(f"Labeled dataset shape: {df_labeled.shape}")
print(f"Classes: {df_labeled['primary_label'].unique().tolist()}")

# Samples per class, most frequent first.
label_counts = df_labeled['primary_label'].value_counts()

# Largest-to-smallest class ratio, computed from the data so the printed
# figure and the chart title cannot drift from the file that was loaded
# (the ratio used to be a hard-coded "14.6").
imbalance_ratio = label_counts.max() / label_counts.min()

print(f"\nClass Distribution ({imbalance_ratio:.1f}:1 imbalance):")
print(label_counts)

# Class distribution bar chart
plt.figure(figsize=(10, 6))
plt.bar(label_counts.index, label_counts.values, color='skyblue')
plt.title(f'Class Distribution ({imbalance_ratio:.1f}:1 Imbalance)')
plt.xlabel('Primary label')
plt.ylabel('Number of samples')
plt.xticks(rotation=45)
plt.show()


In [None]:
# PCA projection of the RGB features onto two components, coloured by class.
# The import stays in this cell so the cell keeps working when run on its own.
from sklearn.decomposition import PCA

X = df_labeled[['red', 'green', 'blue']].values
y = df_labeled['primary_label'].values

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Factorize once so the integer colour codes and the class names stay aligned.
# NOTE(review): assumes primary_label has no missing values; factorize would
# give those code -1, which would shift the legend entries - confirm upstream.
class_codes, class_names = pd.factorize(y)

fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=class_codes, alpha=0.6)

# Report how much variance each plotted component explains.
pc1_var, pc2_var = pca.explained_variance_ratio_
ax.set_xlabel(f'PC1 ({pc1_var:.1%} of variance)')
ax.set_ylabel(f'PC2 ({pc2_var:.1%} of variance)')

# Without a legend the colours cannot be mapped back to classes.
# num=None requests one handle per unique code (ascending), which matches
# the order of class_names returned by factorize.
handles, _ = points.legend_elements(num=None)
ax.legend(handles, list(class_names), title='primary_label',
          bbox_to_anchor=(1.02, 1), loc='upper left')

ax.set_title('PCA: Classes are Well-Separated')
plt.show()

In [None]:

# Final summary: headline numbers gathered first, then printed in one go.
n_samples = len(df_labeled)
n_classes = len(df_labeled['primary_label'].unique())

# Imbalance ratio derived from the per-class counts (largest / smallest class).
max_count = label_counts.max()
min_count = label_counts.min()
imbalance_ratio = max_count / min_count

summary_lines = [
    "\nüìã EDA SUMMARY:",
    f"‚Ä¢ Samples: {n_samples:,}",
    f"‚Ä¢ Classes: {n_classes}",
    f"‚Ä¢ Imbalance: {imbalance_ratio:.1f}:1 ratio",
    # Share of the dataset taken by the most and least frequent class.
    f"  - {label_counts.idxmax()}: {max_count / n_samples * 100:.1f}%",
    f"  - {label_counts.idxmin()}: {min_count / n_samples * 100:.1f}%",
    "‚Ä¢ PCA shows good separability ‚Üí Models should work well",
    "‚Ä¢ Need class_weight='balanced' for classification models",
]
print(*summary_lines, sep="\n")

In [None]:
# Outlier check for models: report each channel's range, confirm 0-1 scaling,
# and count values sitting at the very edges of that range.
print("Outlier Check for Models")

rgb_cols = ['red', 'green', 'blue']
rgb = df_labeled[rgb_cols]

# 1. Data Range
print("\n1. Data Range:")
for col in rgb_cols:
    print(f"   {col}: {rgb[col].min():.3f} to {rgb[col].max():.3f}")

# Check every channel and both bounds; looking only at red's maximum could
# report "scaled" while green/blue (or negative values) are out of range.
is_unit_scaled = bool(((rgb.min() >= 0) & (rgb.max() <= 1)).all())
if is_unit_scaled:
    print("\n‚úÖ Data is scaled 0-1 (good for all models)")
else:
    print("\nWARNING: data is not scaled 0-1; the 0.01/0.99 thresholds below assume 0-1 scaling")

# 2. Extreme Values
print("\n2. Extreme Values:")
# Boolean frame: True where a channel value is below 0.01 or above 0.99.
extreme_mask = (rgb < 0.01) | (rgb > 0.99)
per_channel_extreme = extreme_mask.sum()
for col in rgb_cols:
    print(f"   {col}: {per_channel_extreme[col]} values <0.01 or >0.99")

# Total counts channel cells, so one row can contribute up to three times.
total_extreme = int(per_channel_extreme.sum())
# The percentage is row-based: dividing the cell total by the row count
# double-counted rows with several extreme channels and could exceed 100%.
rows_with_extreme = int(extreme_mask.any(axis=1).sum())
n_rows = len(df_labeled)
pct_rows = rows_with_extreme / n_rows * 100 if n_rows else 0.0  # guard the empty frame

print(f"\n   Total: {total_extreme} extreme values")
print(f"   Rows affected: {rows_with_extreme} ({pct_rows:.1f}% of rows)")

# 3. Recommendation
print("\n3. Models Recommendation:")

if total_extreme > 0:
    print("   KNN: Remove extremes")
else:
    print("   KNN: No extremes found")
