In [30]:
!pip install -r req.txt ## installs required dependencies from req.txt file 



In [41]:
import ast ## To safely parse color tuples that were stored as strings
from collections import Counter ## To identify primary colors

import cv2 ## To read and process image
import matplotlib.pyplot as plt ## To create graphs and scatter plots
import numpy as np ## To handle image datasets as arrays
import pandas as pd ## To handle csv datasets
from PIL import Image ## For working with images
from sklearn.cluster import KMeans ## To group similar colors together

In [34]:
from datasets import load_dataset

## Load the Pokémon dataset for use in the cells below
ds = load_dataset(path="tungdop2/pokemon")

## Printing the DatasetDict shows its splits, features and row counts
print(ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'name', 'type_1', 'type_2', 'caption'],
        num_rows: 1271
    })
})


We will use K-Means clustering to analyse the dataset.
K-Means clustering is an unsupervised machine learning algorithm. It works by partitioning the data into K groups (clusters) so that the points within each group are as similar to one another as possible.
Pokémon images contain many colors, so we need to extract the dominant ones for classification.
K-Means groups similar pixel colors together; the center of the largest group is the image's most predominant color.

In [51]:
def get_dominant_color(image, k=3):
    """Return the dominant RGB color of an image.

    The pixels are clustered with K-Means; the centroid of the cluster that
    contains the most pixels is reported as the dominant color.

    Parameters
    ----------
    image
        Object exposing ``convert("RGB")`` (e.g. a PIL image).
    k : int, default 3
        Number of color clusters to fit.

    Returns
    -------
    numpy.ndarray
        Integer array ``[R, G, B]``, each channel rounded to the nearest integer.
    """
    ## Force three channels: the dataset has some greyscale or RGBA images, and without
    ## this the reshape below fails with "ValueError: cannot reshape array of size ... into shape (3)"
    rgb_image = image.convert("RGB")

    ## One row per pixel, one column per channel (R, G, B)
    pixels = np.array(rgb_image).reshape((-1, 3))

    ## K-Means starts from random centroids and can get stuck in a local minimum;
    ## n_init=10 reruns it from different initial centroids and keeps the best fit
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(pixels)

    ## Count pixels per cluster in one vectorized pass; the largest cluster is the dominant one
    ## (argmax picks the lowest cluster label if two clusters are exactly the same size)
    cluster_sizes = np.bincount(kmeans.labels_, minlength=k)
    dominant_color = kmeans.cluster_centers_[cluster_sizes.argmax()]

    ## Round to the nearest integer: astype(int) alone truncates, turning e.g. 254.9 into 254
    return np.rint(dominant_color).astype(int)

An interesting point to note here is something that confused me at the beginning: aren't we supposed to split the data into train and test sets? That is actually not required, because this is an unsupervised learning model rather than a classification or regression model — there are no labels to evaluate predictions against. The model simply groups the data based on similarity.

In [53]:
## Build one {name: (R, G, B)} record per Pokémon in the train split
pokemon_color = []
for row in ds["train"]:
    ## Iterate rows directly so each row (and its image) is fetched only once
    dominant_color = get_dominant_color(row["image"])
    ## .tolist() yields plain Python ints, so the tuple prints and serializes as "(247, 249, 251)"
    pokemon_color.append({row["name"]: tuple(dominant_color.tolist())})
                       

In [57]:
## Each single-key dict becomes one row, and every distinct name becomes its own column
df = pd.DataFrame(data=pokemon_color)
print(df.head(n=5))

    abomasnow-mega        abomasnow             abra       absol-mega  \
0  (247, 249, 251)              NaN              NaN              NaN   
1              NaN  (250, 250, 251)              NaN              NaN   
2              NaN              NaN  (254, 254, 254)              NaN   
3              NaN              NaN              NaN  (247, 249, 252)   
4              NaN              NaN              NaN              NaN   

             absol accelgor aegislash-blade aegislash-shield aerodactyl-mega  \
0              NaN      NaN             NaN              NaN             NaN   
1              NaN      NaN             NaN              NaN             NaN   
2              NaN      NaN             NaN              NaN             NaN   
3              NaN      NaN             NaN              NaN             NaN   
4  (249, 250, 252)      NaN             NaN              NaN             NaN   

  aerodactyl  ... zoroark zorua-hisui zorua zubat zweilous  \
0        NaN  ... 

In [85]:
## Flatten the wide, mostly-NaN frame into one (Name, Color) row per filled cell.
## Rows are collected in a list and the frame is built once, instead of growing it
## with pd.concat inside the loop.
color_records = [
    {"Name": col, "Color": val}
    for col in df.columns
    for val in df[col]
    if pd.notna(val)
]
new_df = pd.DataFrame(color_records, columns=["Name", "Color"])

## Colors that arrive as strings such as "(247, 249, 251)" are parsed back into tuples.
## ast.literal_eval only accepts Python literals, so it cannot execute arbitrary code.
new_df["Color"] = new_df["Color"].apply(
    lambda color: ast.literal_eval(color) if isinstance(color, str) else color
)


In [89]:
## Write the Name/Color table to disk as CSV, leaving out the row index
new_df.to_csv(path_or_buf="pokemon_color.csv", index=False)
