In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import tensorflow as tf

In [2]:
# Load the dataset
df = pd.read_csv("../../Datasets/colors.csv")
df

Unnamed: 0,Name_Small,Name,HEX,R,G,B
0,air_force_blue_raf,Air Force Blue (Raf),#5d8aa8,93,138,168
1,air_force_blue_usaf,Air Force Blue (Usaf),#00308f,0,48,143
2,air_superiority_blue,Air Superiority Blue,#72a0c1,114,160,193
3,alabama_crimson,Alabama Crimson,#a32638,163,38,56
4,alice_blue,Alice Blue,#f0f8ff,240,248,255
...,...,...,...,...,...,...
860,yellow_orange,Yellow Orange,#ffae42,255,174,66
861,yellow_process,Yellow (Process),#ffef00,255,239,0
862,yellow_ryb,Yellow (Ryb),#fefe33,254,254,51
863,zaffre,Zaffre,#0014a8,0,20,168


In [3]:
X = df[["R", "G", "B"]] / 255.0
y = df["Name"].values
X.duplicated().sum()

100

In [4]:
# Encode the labels
le = LabelEncoder()
y_encoded =le.fit_transform(y)
y_onehot = np.eye(len(np.unique(y_encoded)))[y_encoded]
y_onehot

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [5]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((692, 3), (173, 3), (692, 865), (173, 865))

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, input_dim=3, activation='relu'),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(y_onehot.shape[1], activation="softmax")
])
model.compile(optimizer="adam", loss = "categorical_crossentropy", metrics=["accuracy"])
model.fit(X, y_onehot, epochs=100, batch_size=8, validation_data=[X_test, y_test])

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 6.7780 - val_accuracy: 0.0000e+00 - val_loss: 6.7587
Epoch 2/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 6.7780 - val_accuracy: 0.0000e+00 - val_loss: 6.7587
Epoch 2/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0034 - loss: 6.7558 - val_accuracy: 0.0000e+00 - val_loss: 6.6818
Epoch 3/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0034 - loss: 6.7558 - val_accuracy: 0.0000e+00 - val_loss: 6.6818
Epoch 3/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0014 - loss: 6.6530 - val_accuracy: 0.0000e+00 - val_loss: 6.3944
Epoch 4/100
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0014 - loss: 6.6530 - val_accuracy: 0.0000e+00 - val_loss: 6.3944
Ep

<keras.src.callbacks.history.History at 0x256f72b2fb0>

In [7]:
# Predict color for RGB values [255, 191, 0] (gold/yellow color)
rgb_input = np.array([[255, 191, 0]]) / 255.0  # Normalize RGB values
prediction = model.predict(rgb_input)
predicted_class = np.argmax(prediction)
predicted_color = le.inverse_transform([predicted_class])[0]

print(f"RGB values: [255, 191, 0]")
print(f"Predicted class index: {predicted_class}")
print(f"Predicted color: {predicted_color}")
print(f"Prediction confidence: {prediction[0][predicted_class]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
RGB values: [255, 191, 0]
Predicted class index: 337
Predicted color: Golden Poppy
Prediction confidence: 0.2149
RGB values: [255, 191, 0]
Predicted class index: 337
Predicted color: Golden Poppy
Prediction confidence: 0.2149


In [8]:
# Check what Selective Yellow looks like
selective_yellow = df[df['Name'] == 'Selective Yellow']
if not selective_yellow.empty:
    print("Selective Yellow RGB values:")
    print(f"[{selective_yellow['R'].iloc[0]}, {selective_yellow['G'].iloc[0]}, {selective_yellow['B'].iloc[0]}]")
else:
    print("Selective Yellow not found")

# Let's test the model with the exact Amber values from the dataset
amber_test = df[df['Name'] == 'Amber']
if not amber_test.empty:
    amber_rgb = amber_test[['R', 'G', 'B']].values[0] / 255.0
    print(f"\nTesting with exact Amber RGB values: {amber_rgb}")
    
    amber_prediction = model.predict(amber_rgb.reshape(1, -1))
    amber_predicted_class = np.argmax(amber_prediction)
    amber_predicted_color = le.inverse_transform([amber_predicted_class])[0]
    
    print(f"Predicted class: {amber_predicted_class}")
    print(f"Predicted color: {amber_predicted_color}")
    print(f"Confidence: {amber_prediction[0][amber_predicted_class]:.4f}")
    
    # Show top 5 predictions
    top_5_indices = np.argsort(amber_prediction[0])[-5:][::-1]
    print(f"\nTop 5 predictions:")
    for i, idx in enumerate(top_5_indices):
        color_name = le.inverse_transform([idx])[0]
        confidence = amber_prediction[0][idx]
        print(f"{i+1}. {color_name}: {confidence:.4f}")

# Check model accuracy
print(f"\nModel training accuracy from last epoch should be visible above")
print("The model might need more training or different architecture")

Selective Yellow RGB values:
[255, 186, 0]

Testing with exact Amber RGB values: [1.         0.74901961 0.        ]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted class: 337
Predicted color: Golden Poppy
Confidence: 0.2149

Top 5 predictions:
1. Golden Poppy: 0.2149
2. Fluorescent Orange: 0.1672
3. Amber: 0.1437
4. Tangerine Yellow: 0.0785
5. Selective Yellow: 0.0779

Model training accuracy from last epoch should be visible above
The model might need more training or different architecture
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted class: 337
Predicted color: Golden Poppy
Confidence: 0.2149

Top 5 predictions:
1. Golden Poppy: 0.2149
2. Fluorescent Orange: 0.1672
3. Amber: 0.1437
4. Tangerine Yellow: 0.0785
5. Selective Yellow: 0.0779

Model training accuracy from last epoch should be visible above
The model might need more training or different architecture


In [9]:
# Find duplicate RGB values with different names
print("Analyzing duplicate RGB values in the dataset...")
print(f"Total rows in dataset: {len(df)}")

# Group by RGB values and check for duplicates
rgb_groups = df.groupby(['R', 'G', 'B'])['Name'].apply(list).reset_index()
duplicates = rgb_groups[rgb_groups['Name'].apply(len) > 1]

print(f"Number of unique RGB combinations: {len(rgb_groups)}")
print(f"Number of RGB combinations with multiple names: {len(duplicates)}")

if len(duplicates) > 0:
    print(f"\nTotal duplicate entries: {duplicates['Name'].apply(len).sum() - len(duplicates)}")
    print("\nFirst 10 examples of RGB values with multiple names:")
    for i in range(min(10, len(duplicates))):
        rgb = duplicates.iloc[i]
        print(f"RGB [{rgb['R']}, {rgb['G']}, {rgb['B']}]: {rgb['Name']}")

    # Clean the dataset by keeping only the first occurrence of each RGB combination
    print(f"\nCleaning dataset...")
    df_clean = df.drop_duplicates(subset=['R', 'G', 'B'], keep='first')
    print(f"Original dataset size: {len(df)}")
    print(f"Cleaned dataset size: {len(df_clean)}")
    print(f"Removed {len(df) - len(df_clean)} duplicate RGB entries")
    
    # Check if Amber is still there
    amber_in_clean = df_clean[df_clean['Name'] == 'Amber']
    if not amber_in_clean.empty:
        print(f"\nAmber is still in cleaned dataset: RGB [{amber_in_clean['R'].iloc[0]}, {amber_in_clean['G'].iloc[0]}, {amber_in_clean['B'].iloc[0]}]")
    else:
        print(f"\nAmber was removed during cleaning")
        # Find what color name is kept for RGB [255, 191, 0]
        amber_rgb_kept = df_clean[(df_clean['R'] == 255) & (df_clean['G'] == 191) & (df_clean['B'] == 0)]
        if not amber_rgb_kept.empty:
            print(f"RGB [255, 191, 0] is now labeled as: {amber_rgb_kept['Name'].iloc[0]}")
else:
    print("No duplicate RGB values found!")
    df_clean = df.copy()

Analyzing duplicate RGB values in the dataset...
Total rows in dataset: 865
Number of unique RGB combinations: 765
Number of RGB combinations with multiple names: 81

Total duplicate entries: 100

First 10 examples of RGB values with multiple names:
RGB [0, 51, 153]: ['Dark Powder Blue', 'Smalt (Dark Powder Blue)']
RGB [0, 65, 106]: ['Dark Imperial Blue', 'Indigo (Dye)']
RGB [0, 103, 165]: ['Medium Persian Blue', 'Sapphire Blue']
RGB [0, 123, 167]: ['Celadon Blue', 'Cerulean']
RGB [0, 128, 0]: ['Ao (English)', 'Green (Html/Css Green)', 'Office Green']
RGB [0, 191, 255]: ['Capri', 'Deep Sky Blue']
RGB [0, 255, 0]: ['Electric Green', 'Green (Color Wheel) (X11 Green)', 'Lime (Web) (X11 Green)']
RGB [0, 255, 127]: ['Guppie Green', 'Spring Green']
RGB [0, 255, 255]: ['Aqua', 'Cyan', 'Electric Cyan']
RGB [1, 68, 33]: ['Forest Green (Traditional)', 'Up Forest Green']

Cleaning dataset...
Original dataset size: 865
Cleaned dataset size: 765
Removed 100 duplicate RGB entries

Amber is still in 

In [10]:
# Retrain the model with cleaned dataset
print("Retraining model with cleaned dataset...")

# Prepare cleaned data
X_clean = df_clean[["R", "G", "B"]] / 255.0
y_clean = df_clean["Name"].values

# Encode the labels
le_clean = LabelEncoder()
y_clean_encoded = le_clean.fit_transform(y_clean)
y_clean_onehot = np.eye(len(np.unique(y_clean_encoded)))[y_clean_encoded]

print(f"Clean dataset shapes: X={X_clean.shape}, y={y_clean_onehot.shape}")

# Train-test split
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean_onehot, test_size=0.2, random_state=42
)

# Create and train new model
model_clean = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_dim=3, activation='relu'),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(y_clean_onehot.shape[1], activation="softmax")
])

model_clean.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
print("Training model with cleaned data...")
history = model_clean.fit(
    X_train_clean, y_train_clean, 
    epochs=150, 
    batch_size=16, 
    validation_data=[X_test_clean, y_test_clean],
    verbose=1
)

Retraining model with cleaned dataset...
Clean dataset shapes: X=(765, 3), y=(765, 765)
Training model with cleaned data...
Training model with cleaned data...
Epoch 1/150
Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0000e+00 - loss: 6.6433 - val_accuracy: 0.0000e+00 - val_loss: 6.6607
Epoch 2/150
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0000e+00 - loss: 6.6433 - val_accuracy: 0.0000e+00 - val_loss: 6.6607
Epoch 2/150
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 6.6304 - val_accuracy: 0.0000e+00 - val_loss: 6.7209
Epoch 3/150
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0000e+00 - loss: 6.6304 - val_accuracy: 0.0000e+00 - val_loss: 6.7209
Epoch 3/150
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 6.6011 - val_accuracy: 0.0000e+00 - val_loss: 6.9708
Epoch 4/150
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0000e+00 - loss: 6.6011 - val_accuracy: 0.0000e+00 - val_loss: 6.970