In [None]:
import time
import os
from elasticsearch import Elasticsearch
import requests
from piffle.iiif import IIIFImageClient
import json
import math
import numpy as np
import collections

In [None]:
# Elasticsearch index that is read from: passed to index_traversing() as the source.
source_name = 'lab_clustering_hex'
# Elasticsearch index that is written to: passed to index_traversing() as the target.
target_name = 'proportion_searching_index'

In [None]:
def create_es_client():
    """Build an Elasticsearch client and block until the server answers a ping.

    The host and the basic-auth credentials are read from the ``LOCAL_HOST``,
    ``LOCAL_USER`` and ``LOCAL_PASS`` environment variables.

    Returns:
        Elasticsearch: a client whose ``ping()`` has returned a truthy value.

    Raises:
        KeyError: if one of the three environment variables is not set.
    """
    host = os.environ['LOCAL_HOST']
    credentials = (os.environ['LOCAL_USER'], os.environ['LOCAL_PASS'])
    client = Elasticsearch(hosts=host, http_auth=credentials)

    # Poll once per second until the server responds. There is no timeout,
    # so this waits indefinitely if the server never comes up.
    while not client.ping():
        time.sleep(1)
    return client
    
def Extracting_clustering_info(response):
    """Yield one flat record for every hit of an Elasticsearch search response.

    Args:
        response: mapping shaped like a search result, i.e. the hits are
            found under ``response['hits']['hits']``.

    Yields:
        dict: with keys ``id`` (the hit's ``_id``), ``url`` (``iiif_url``),
        ``cluster_number``, ``lab_centers_list`` and ``proportion``
        (the hit's ``cluster_info.cluster_proportion``).

    Raises:
        KeyError: if a hit lacks any of the fields read here.
    """
    for hit in response['hits']['hits']:
        doc_id = hit['_id']
        source = hit['_source']
        iiif_url = source['iiif_url']
        clusters = source['cluster_info']
        yield {
            'id': doc_id,
            'url': iiif_url,
            'cluster_number': clusters['cluster_number'],
            'lab_centers_list': clusters['lab_centers_list'],
            # Stored as "cluster_proportion" in the source, exposed as "proportion".
            'proportion': clusters['cluster_proportion'],
        }

def get_proportion_sign(cluster_number, proportion):
    """Classify a cluster's share of the image into one of three levels.

    The reference value is the even split ``1 / cluster_number``:

    * ``'L'`` when ``proportion >= 1.5 * reference``
    * ``'M'`` when ``0.5 * reference <= proportion < 1.5 * reference``
    * ``'S'`` when ``proportion < 0.5 * reference``

    Args:
        cluster_number: number of clusters; used as the divisor.
        proportion: share covered by one cluster.

    Returns:
        str: ``'L'``, ``'M'`` or ``'S'``.

    Raises:
        ZeroDivisionError: if ``cluster_number`` is 0.
        ValueError: if ``proportion`` cannot be ordered against the
            thresholds (e.g. NaN). Previously this fell through every branch
            and failed with an UnboundLocalError on the return statement.
    """
    standard = 1 / cluster_number
    upper = 1.5 * standard
    lower = 0.5 * standard

    if proportion >= upper:
        return 'L'
    if proportion >= lower:
        return 'M'
    if proportion < lower:
        return 'S'
    # Only reachable when every comparison above is False, i.e. NaN-like input.
    raise ValueError(
        "proportion %r cannot be compared with the cluster thresholds" % (proportion,)
    )
        
def labcenters2searchterm(cluster_number, sorted_pclist):
    """Build the colour search term for one image.

    The LAB space is divided into 10 * 16 * 16 cubes; the cube that contains
    a cluster centre represents a dominant colour. The result starts with
    ``str(cluster_number)`` and, for every entry of ``sorted_pclist``, appends
    ``'_'`` + proportion sign + three zero-padded two-character bin indices
    (L, a, b).

    Args:
        cluster_number: number of clusters; also forwarded to
            ``get_proportion_sign``.
        sorted_pclist: iterable of entries where ``entry[0]`` is the
            proportion and ``entry[1]`` is the (L, a, b) centre.
            NOTE(review): the bin arithmetic suggests each channel is scaled
            to the 0-255 range - confirm against the clustering step.

    Returns:
        str: the search term, e.g. just ``'3'`` for an empty list.
    """
    tokens = [str(cluster_number)]
    for entry in sorted_pclist:
        lab = entry[1]
        l_bin = str(math.floor(lab[0] * 10 / 256)).zfill(2)  # 10 bins on L
        a_bin = str(math.floor(lab[1] / 16)).zfill(2)        # 16 bins on a
        b_bin = str(math.floor(lab[2] / 16)).zfill(2)        # 16 bins on b
        sign = get_proportion_sign(cluster_number, entry[0])
        tokens.append(sign + l_bin + a_bin + b_bin)
    return '_'.join(tokens)
    

In [None]:
def _build_search_document(info, info_field):
    """Turn one extracted record into the body written to the target index."""
    # Presumably the URL of a 50-pixel-wide rendition of the image - confirm
    # against the piffle documentation.
    image_url = str(IIIFImageClient().init_from_url(info['url']).size(width=50))

    # Pair every proportion with its LAB centre, ordered ascending by
    # proportion, so the search string lists the smallest cluster first.
    pc_list = sorted(
        [proportion, lab_centers]
        for proportion, lab_centers in zip(info['proportion'], info['lab_centers_list'])
    )
    searchstr = labcenters2searchterm(info['cluster_number'], pc_list)

    return {
        "iiif_url": info['url'],
        "image_url": image_url,
        info_field: {
            "cluster_number": info['cluster_number'],
            "lab_centers_list": info['lab_centers_list'],
            "proportion": info['proportion'],
            "searchstr": searchstr,
        },
    }


def index_traversing(size, target_name, source_name, info_field="pcsearch_info"):
    """Copy every document of ``source_name`` into ``target_name`` with a search string.

    The source index is read page by page (``size`` hits per request, sorted
    by ``_id`` and continued with ``search_after``). Each hit is re-indexed
    under the same id with its IIIF URL, a resized image URL and a nested
    object holding the cluster data plus the generated ``searchstr``.

    Changes compared with the previous version:

    * Every document now uses the same nested field name. Before, the first
      page was written under ``"pcsearch_info"`` and all later pages under
      ``"divide_info"``.
      NOTE(review): ``"pcsearch_info"`` is kept as the default; pass
      ``info_field="divide_info"`` if downstream queries expect that name.
    * A source index with fewer than ``size`` documents (or none) no longer
      raises ``IndexError`` when the last id of the first page is looked up.
    * The first page and the following pages share one code path.
    * The printed running total counts the documents actually processed.

    The target index is not created here; the original author created it once
    by hand with ``es.indices.create(index=target_name)``.

    Args:
        size: number of hits requested per search call.
        target_name: name of the index that receives the documents.
        source_name: name of the index that is read.
        info_field: key of the nested object that holds the cluster data and
            the search string in each written document.
    """
    es = create_es_client()
    search_body = {
        "query": {"match_all": {}},
        "size": size,
        "from": 0,
        "sort": {
            "_id": "asc",  # document ID.
        },
    }
    counter = 0  # just to make the waiting time less painful

    while True:
        response = es.search(index=source_name, body=search_body)
        hits = response['hits']['hits']

        for info in Extracting_clustering_info(response):
            print(info['id'])
            es.index(
                index=target_name,
                id=info['id'],
                body=_build_search_document(info, info_field),
            )

        counter += len(hits)
        print(counter)

        # A short (or empty) page means the source index is exhausted.
        if not hits or len(hits) < size:
            break
        # Continue right after the last document of this page.
        search_body["search_after"] = [hits[-1]['_id']]


# Run the copy from source_name into target_name, fetching 20 documents per search request.
index_traversing(20, target_name, source_name)

In [None]:
# Spot-check: fetch one known document from the source index and display it.
# Reuses create_es_client() and source_name instead of repeating the client
# construction and the index name, so this cell stays in sync with the
# configuration cell above.
es = create_es_client()

response = es.get(
    index=source_name,
    id='a4dvcaa5'
)

response