In [1]:
import ast
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from collections import Counter 
# from environment.settings import config
from numpy import matlib
from sklearn.cluster import KMeans
from sqlalchemy import select, text, and_, or_
from sqlalchemy.sql import Select
from typing import Tuple
from utils import connections
from utils import database
import pickle

# database_dir = config['DATABASE_DIR']
# dataset_dir = config['DATASET_DIR']
# The config-based lookup above is disabled; a relative data directory is used instead.
dataset_dir = '../data/'

# Number of clusters (colors) the KMeans model extracts.
NUM_COLORS = 10
# Fixed seed: KMeans initialisation is random, so without it every
# Restart & Run All would produce different clusters.
RANDOM_SEED = 42

kmeans_model = KMeans(n_clusters=NUM_COLORS, n_init='auto', random_state=RANDOM_SEED)

Functions

In [2]:
def rgb2hex(rgb: np.ndarray):
    ''' Formats every row of an N X 3 array of RGB values as a '#rrggbb' string and returns the strings as a list '''
    return ['#%02x%02x%02x' % tuple(channels) for channels in rgb]

def extract_colors(model: KMeans, img: np.ndarray) -> Tuple[np.ndarray, Counter, np.ndarray]:
    ''' Fit the KMeans model on img and summarise the resulting clusters.

    Returns a 3-tuple:
      - the label assigned to every sample of img, cast to uint8
      - a Counter with the number of samples per label (built from the un-cast labels)
      - the fitted cluster centers, cast to uint8 (the cast drops the fractional part)
    '''
    assigned = model.fit_predict(img)
    samples_per_label = Counter(assigned)
    centers = model.cluster_centers_.astype(np.uint8)
    return assigned.astype(np.uint8), samples_per_label, centers

We need to group the artworks by:
- Artist
- Movement
- Century
- Country

In [3]:
# Artist = pd.read_csv(dataset_dir+'Artist.csv')
# ArtistMovement = pd.read_csv(dataset_dir+'ArtistMovements.csv')
# Artwork = pd.read_csv(dataset_dir+'Artwork.csv')
# Movement = pd.read_csv(dataset_dir+'Movement.csv')
# Read Stratos' pickled file; each record is later accessed through its
# 'cluster_counts' and 'rgb_colors' keys (presumably one record per image -- confirm).
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
faces_pickle_path = dataset_dir+'img_pallets_faces.pkl'
with open(faces_pickle_path, mode='rb') as handle:
    faces_pallete = pickle.load(handle)

Cluster by artist

In [4]:
# Pool every record's palette into one array: each record contributes roughly
# 100 rows, with every palette color repeated in proportion to its share of
# that record's cluster counts. The pooled rows are then clustered again.
color_percentage_list = []
for item in faces_pallete:
    # Loop-local names, so the final `cluster_counts` / `rgb_colors` below are
    # not confused with the per-record inputs.
    item_counts, item_colors = item['cluster_counts'], item['rgb_colors']

    # Counts ordered by cluster label.
    # NOTE(review): assumes the sorted labels line up one-to-one with the rows
    # of item_colors (no label missing from the counts) -- confirm.
    values = [count for _label, count in sorted(item_counts.items())]
    # Share of every color in percent; the total is computed once.
    total = sum(values)
    percentages = [value / total * 100 for value in values]
    # Round the cumulative sum rather than the individual shares, so rounding
    # errors do not accumulate and the integer sizes still add up to the
    # rounded total (100).
    cum_percentages = np.cumsum(percentages).round(0)
    # Differences of consecutive cumulative values = integer size of each color.
    sizes = np.diff(cum_percentages, prepend=0).astype(int)
    # One (size, 3) block per color; stacked they form this record's rows.
    color_blocks = [np.tile(color, (size, 1)) for color, size in zip(item_colors, sizes)]
    color_percentage_list.append(np.vstack(color_blocks))

artist_colors = np.vstack(color_percentage_list)
# The per-row labels are not needed; avoid binding `_`, which IPython uses for
# the last cell output.
_pooled_labels, cluster_counts, rgb_colors = extract_colors(kmeans_model, artist_colors)

In [5]:
# Store the super-cluster result as text made of plain Python literals.
# Converting to built-in ints first keeps the CSV free of NumPy scalar reprs
# such as `np.int32(3)` (NumPy >= 2), and building the dict text directly avoids
# `str.lstrip('Counter(')`, which strips a set of characters, not a prefix.
# The dict keeps the most-common-first order that the Counter repr would have used.
counts_text = str({int(label): int(count) for label, count in cluster_counts.most_common()})
# Nested list of [r, g, b] rows as built-in ints.
colors_text = str(np.asarray(rgb_colors).tolist())

df = pd.DataFrame({'cluster_counts': [counts_text], 'rgb_colors': [colors_text]})
df.to_csv(dataset_dir+'faces_supercluster.csv')