In [None]:
import time
import os
from elasticsearch import Elasticsearch
import requests
from piffle.iiif import IIIFImageClient
import json
import math

In [None]:
# Name of the Elasticsearch index that receives the clustering results
# (passed to index_traversing and queried by the cells further down).
index_name = 'lab_clustering_hex'

In [None]:
def create_es_client():
    """Create an Elasticsearch client and block until the cluster answers a ping.

    The host is read from the ``LOCAL_HOST`` environment variable and the
    HTTP basic-auth credentials from ``LOCAL_USER`` / ``LOCAL_PASS``.

    The function pings once per second until a ping succeeds.  There is no
    upper bound on the wait, so it never returns if the cluster stays down.

    Returns:
        The ``Elasticsearch`` client, once it has responded to a ping.

    Raises:
        KeyError: if one of the three environment variables is not set.
    """
    host = os.environ['LOCAL_HOST']
    credentials = (os.environ['LOCAL_USER'], os.environ['LOCAL_PASS'])
    client = Elasticsearch(hosts=host, http_auth=credentials)

    # Poll until the cluster accepts connections.
    while not client.ping():
        time.sleep(1)
    return client
    
def Extracting_url_ID(response):
    """Yield one ``{'id': ..., 'url': ...}`` dict per hit of a search response.

    Args:
        response: a search response mapping whose ``['hits']['hits']`` entry
            is a list of hits.  Each hit must carry ``_id`` and a thumbnail
            URL at ``_source.state.derivedData.thumbnail.url``.

    Yields:
        A dict with the document ID under ``'id'`` and the thumbnail URL
        under ``'url'``, in the order the hits appear in the response.

    Raises:
        KeyError: if a hit lacks ``_id`` or any part of the thumbnail path.
    """
    hits = response['hits']['hits']
    for hit in hits:
        doc_id = hit['_id']
        thumbnail_url = hit['_source']['state']['derivedData']['thumbnail']['url']
        yield {'id': doc_id, 'url': thumbnail_url}

def clustering_api_request(image_url, api_url='http://api:80/image/cluster/', timeout=None):
    """Ask the clustering API to cluster the colours of one image.

    The image location is sent in the ``image-url`` request header of a GET
    request.  The response body is expected to be a JSON object.

    Args:
        image_url: URL of the image to cluster.
        api_url: endpoint of the clustering service.  Defaults to the address
            that was previously hard-coded.
        timeout: seconds to wait for the service, passed straight to
            ``requests.get``.  ``None`` (the default) waits indefinitely,
            which matches the previous behaviour.

    Returns:
        A 5-tuple ``(center_list, center_number, cluster_size,
        cluster_proportion, lab_center_list)`` taken from the JSON response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        KeyError: if the JSON response lacks one of the expected fields.
    """
    header = {'image-url': image_url}
    print(image_url)
    response = requests.get(api_url, headers=header, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as a clustering result.
    response.raise_for_status()
    payload = response.json()
    return (
        payload['center_list'],
        payload['center_number'],
        payload['cluster_size'],
        payload['cluster_proportion'],
        payload['lab_center_list'],
    )

def hex2rgb(hexcolor):
    """Unpack a 24-bit integer colour into a ``[red, green, blue]`` list.

    Args:
        hexcolor: integer colour value, e.g. ``0xff8800``.  Bits above the
            low 24 are ignored.

    Returns:
        A list of three ints, each in the range 0-255.
    """
    # Red sits in bits 16-23, green in bits 8-15, blue in bits 0-7.
    return [(hexcolor >> shift) & 0xff for shift in (16, 8, 0)]

def rgb2hex(rgbcolor):
    """Pack an ``(r, g, b)`` triple into a hexadecimal string.

    Args:
        rgbcolor: iterable of exactly three integers (red, green, blue).

    Returns:
        The string produced by the built-in ``hex`` for the packed value,
        e.g. ``'0xff8800'``.  The result keeps the ``0x`` prefix and is NOT
        zero-padded, so ``(0, 0, 255)`` gives ``'0xff'``.
    """
    red, green, blue = rgbcolor
    packed = (red << 16) + (green << 8) + blue
    return hex(packed)

def rgblist2hexlist(rgbcolorlist):
    """Convert every RGB triple in a list to its hex string via ``rgb2hex``.

    Args:
        rgbcolorlist: iterable of ``(r, g, b)`` integer triples.

    Returns:
        A new list of hex strings, in the same order as the input.
    """
    return [rgb2hex(rgb_center) for rgb_center in rgbcolorlist]

def labcenters2searchterm(cluster_number, lab_centers_list):
    """Build the colour search term for an image from its Lab cluster centres.

    The Lab space is divided into 10*16*16 cubes.  The cube in which a
    cluster centre lies represents one dominant colour of the image.

    Each centre becomes a 6-digit code made of three zero-padded 2-digit bin
    indices: ``floor(L * 10 / 256)``, ``floor(a / 16)`` and ``floor(b / 16)``.
    The term is the cluster count followed by one code per centre, joined
    with underscores, e.g. ``'4_031008_050809_040810_080809'``.

    Args:
        cluster_number: number of clusters; used as the term's first part.
        lab_centers_list: iterable of centres, each indexable as
            ``[L, a, b]``.  Presumably every channel is on a 0-255 scale,
            given the divisors used; confirm against the clustering API.

    Returns:
        The search term string.
    """
    parts = [str(cluster_number)]
    for center in lab_centers_list:
        bins = (
            math.floor(center[0] * 10 / 256),
            math.floor(center[1] / 16),
            math.floor(center[2] / 16),
        )
        parts.append(''.join(str(bin_index).zfill(2) for bin_index in bins))
    return '_'.join(parts)

def _index_clustered_image(es, index_name, ID_url_dict):
    """Cluster one thumbnail's colours and index the result under the source document's ID."""
    print(ID_url_dict['id'])
    # Request a 50-pixel-wide rendition of the thumbnail via its IIIF URL.
    image_url = str(IIIFImageClient().init_from_url(ID_url_dict['url']).size(width=50))
    (cluster_centers_list, cluster_number, cluster_size,
     cluster_proportion, lab_centers_list) = clustering_api_request(image_url)
    cluster_centers_list_hex = rgblist2hexlist(cluster_centers_list)
    searchstr = labcenters2searchterm(cluster_number, lab_centers_list)

    # Reusing the source document ID means re-running overwrites the earlier
    # result instead of creating a duplicate.
    es.index(
        index=index_name,
        id=ID_url_dict['id'],
        body={
            "iiif_url": ID_url_dict['url'],
            "image_url": image_url,
            "cluster_info": {
                "cluster_centers_list": cluster_centers_list,
                "cluster_centers_list_hex": cluster_centers_list_hex,
                "cluster_number": cluster_number,
                "cluster_size": cluster_size,
                "cluster_proportion": cluster_proportion,
                "lab_centers_list": lab_centers_list,
                "searchstr": searchstr,
            },
        },
    )


def index_traversing(size, index_name):
    """Cluster the thumbnail of every document in the source index.

    Pages through the index named by the ``INDEX_NAME`` environment variable
    in ascending ``_id`` order, ``size`` documents at a time, using
    ``search_after`` as the cursor.  For each document the thumbnail is sent
    to the clustering API and the result is written to ``index_name`` under
    the same document ID.

    The target index is not created here.  The original notebook created it
    once with ``es.indices.create(index='lab_clustering_hex')`` before the
    first run.

    After every page the running count of processed documents is printed.

    Args:
        size: page size, i.e. number of documents fetched per search.
        index_name: name of the index that receives the clustering results.

    Raises:
        KeyError: if ``INDEX_NAME`` (or a connection variable read by
            ``create_es_client``) is not set, or a hit lacks a thumbnail URL.
    """
    es = create_es_client()
    last_result_id = None
    counter = 0  # just to make the waiting time less painful

    while True:
        query_body = {
            "query": {"match_all": {}},
            "size": size,
            "from": 0,
            "sort": {
                "_id": "asc",  # document ID.
            },
        }
        if last_result_id is not None:
            # Continue right after the last document of the previous page.
            query_body["search_after"] = [last_result_id]

        response = es.search(index=os.environ['INDEX_NAME'], body=query_body)
        hits = response['hits']['hits']

        for ID_url_dict in Extracting_url_ID(response):
            _index_clustered_image(es, index_name, ID_url_dict)

        counter += len(hits)
        print(counter)

        # A short (or empty) page means the index is exhausted.  Checking
        # before reading the cursor also avoids indexing into an empty or
        # partially filled page.
        if not hits or len(hits) < size:
            break
        last_result_id = hits[-1]['_id']


# Run the whole traversal in pages of 20 documents, writing the results to
# the index named by `index_name`.
index_traversing(20, index_name)


In [None]:
# Manual check of the clustering API: send one fixed image URL in the
# `image-url` header and print the decoded JSON response.
header = {'image-url': 'https://dlcs.io/iiif-img/wellcome/5/b16753598_l0054283.jp2/full/50,/0/default.jpg'}
response = requests.get('http://api:80/image/cluster/', headers = header)
dr = json.loads(response.text)
print(dr)

In [None]:
# Open a client with the same environment-based settings that
# create_es_client uses (but without waiting for a successful ping).
# `es` is kept as a global and reused by the search cell further down.
es = Elasticsearch(
    hosts=os.environ['LOCAL_HOST'],
    http_auth=(
        os.environ['LOCAL_USER'],
        os.environ['LOCAL_PASS']
    )
)

# Fetch a single document from the results index by its ID to inspect it.
response = es.get(
    index=index_name,
    id='e6aadpc4'
)

# Bare expression so that the notebook displays the document.
response

In [None]:
# Example search term in the format built by labcenters2searchterm: the
# cluster count, then one 6-digit L/a/b bin code per cluster centre.
search_terms = '4_060808_020808_070808_040808'

We can already search with the hex color code.

In [None]:
# Query the results index for documents whose stored search term matches
# `search_terms` (a `match` query on cluster_info.searchstr).
response = es.search(
    index=index_name,
    body={
    "query": {
        "match": {
            "cluster_info.searchstr": search_terms
        }
    }
}
)
# Bare expression so that the notebook displays the search response.
response

Now it's time to create a color ID (search term) for each image. Lab space has 100*256*256 chunks. Each image has 3-5 clusters, and each cluster contains 0-100% of the pixels. The search term should look like 'searchstr': '4_031008_050809_040810_080809'. The leading 4 is the number of clusters. After that come four 6-digit strings, each representing a Lab coordinate. Each coordinate corresponds to a cell of a divided Lab space, in which L is divided into 10 bins, A into 16 and B into 16, giving 10*16*16 chunks. For example, 031008 means the L of the cluster center is in the range 30-40 (bin index 3 of 10 on a 0-100 scale), A in 160-176, and B in 128-144.