## Testing

In [None]:
import os
from elasticsearch import Elasticsearch
from piffle.iiif import IIIFImageClient 
from io import BytesIO
import httpx
from PIL import Image

In [None]:
# Elasticsearch client; the host and basic-auth credentials are read from
# environment variables so nothing sensitive is stored in the notebook.
local_es = Elasticsearch(
    hosts=os.environ["LOCAL_HOST"],
    http_auth=(os.environ["LOCAL_USER"], os.environ["LOCAL_PASS"]),
)

In [None]:
# First page: match every document in the index named by INDEX_NAME,
# 20 hits per page, ordered by ascending _id.
response = local_es.search(
    index=os.environ["INDEX_NAME"],
    body={
        "query": {"match_all": {}},
        "size": 20,
        "from": 0,
        "sort": {"_id": "asc"},  # document ID; assumed to be unique
    },
)

In [None]:
# hits.total counts every document that matches the query, not the number of
# hits on this page. By default Elasticsearch stops counting at 10,000 and
# flags the total with relation "gte", so report it as a lower bound then.
total_hits = response['hits']['total']
at_least = "at least " if total_hits.get('relation') == 'gte' else ""
print(
    f"Found {at_least}{total_hits['value']} "
    f"results in {response['took'] / 1000}s"
)

It still reports 10000 results on every run. Can this be optimised?

In [None]:
#get the id for the last hit
# Index from the end rather than at a fixed position, so this still works
# when the page holds fewer hits than the requested size.
last_result = response['hits']['hits'][-1]
print(last_result['_id'])

In [None]:
#checking the image
iiif_url = last_result['_source']['state']['derivedData']['thumbnail']['url']
# Ask the IIIF image server for a 500px-wide rendition of the thumbnail.
image_url = str(IIIFImageClient().init_from_url(iiif_url).size(width=500))
# Fail loudly on an HTTP error instead of handing an error page to PIL,
# which would otherwise surface as an unrelated "cannot identify image" error.
image_response = httpx.get(image_url)
image_response.raise_for_status()
Image.open(BytesIO(image_response.content))

In [None]:
#Send a query again, searching after the last ID.
response = local_es.search(
    index=os.environ['INDEX_NAME'],
    body={
        "query":{
            "match_all":{}
        },
        "size":20,
        "from":0,
        "search_after": [last_result['_id']],
        "sort":{
            "_id":"asc", #ascending order
        },
    }
)

In [None]:
# First hit of the current page and its document id.
first_result = response["hits"]["hits"][0]
print(first_result["_id"])

In [None]:
#checking the image
iiif_url = first_result['_source']['state']['derivedData']['thumbnail']['url']
# Ask the IIIF image server for a 500px-wide rendition of the thumbnail.
image_url = str(IIIFImageClient().init_from_url(iiif_url).size(width=500))
# Fail loudly on an HTTP error instead of handing an error page to PIL,
# which would otherwise surface as an unrelated "cannot identify image" error.
image_response = httpx.get(image_url)
image_response.raise_for_status()
Image.open(BytesIO(image_response.content))

Check whether the images are sorted as expected.

In [None]:
# Descending-order page of 30 hits, continuing after a fixed document id.
response = local_es.search(
    index=os.environ['INDEX_NAME'],
    body={
        "query":{
            "match_all":{}
        },
        "size":30,
        "from":0,
        "search_after": ['a24yqdyw'],
        "sort":{
            "_id":"desc", #descending order
        },
    }
)

first_result = response['hits']['hits'][0]
print(first_result['_id'])
#checking the image
iiif_url = first_result['_source']['state']['derivedData']['thumbnail']['url']
# Ask the IIIF image server for a 500px-wide rendition of the thumbnail.
image_url = str(IIIFImageClient().init_from_url(iiif_url).size(width=500))
# Fail loudly on an HTTP error instead of handing an error page to PIL,
# which would otherwise surface as an unrelated "cannot identify image" error.
image_response = httpx.get(image_url)
image_response.raise_for_status()
Image.open(BytesIO(image_response.content))



In [None]:
# hits.total counts every document that matches the query, not the number of
# hits on this page. By default Elasticsearch stops counting at 10,000 and
# flags the total with relation "gte", so report it as a lower bound then.
total_hits = response['hits']['total']
at_least = "at least " if total_hits.get('relation') == 'gte' else ""
print(
    f"Found {at_least}{total_hits['value']} "
    f"results in {response['took'] / 1000}s"
)

print()

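This is weird. Why is it still reporting 10k results? There should only be 20. (Note: `hits.total.value` is the number of documents matching the query, which Elasticsearch caps at 10,000 by default; it is not the number of hits returned on the page.)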

In [None]:
print(len(response['hits']['hits']))
# Index from the end: a fixed position such as [20] is out of range on a
# 20-hit page and is not the last hit on a longer one.
last_result = response['hits']['hits'][-1]
print(last_result['_id'])

The hits list does only have 20 elements. I'm confused. Is there any tool that I can use to check the response JSON in a human-eye-friendly way? (not the leeeeengthy data XD)

Anyway, it's functionally working. Time to wrap the process in a function!

## Functions and Script

In [None]:
import os
from elasticsearch import Elasticsearch
from piffle.iiif import IIIFImageClient 
from io import BytesIO
import httpx
from PIL import Image

#connect to ElasticSearch
local_es = Elasticsearch(
    hosts=os.environ['LOCAL_HOST'],
    http_auth=(
        os.environ['LOCAL_USER'],
        os.environ['LOCAL_PASS']
    )
)

# Name the target index once so the create call and later uses cannot drift apart.
index_name = 'id_url_index'

#create a new index
# Creating an index that already exists raises an error, so only create it
# when it is missing; this keeps the cell safe to re-run.
if not local_es.indices.exists(index=index_name):
    local_es.indices.create(index=index_name)



def Extracting_url_ID(response):
    """Yield an ``{'id', 'url'}`` dict for each hit of one search response page.

    ``id`` is the hit's ``_id``; ``url`` is read from
    ``_source.state.derivedData.thumbnail.url``. Hits are yielded lazily, in
    the order they appear in ``response['hits']['hits']``.
    """
    for hit in response['hits']['hits']:
        doc_id = hit['_id']
        thumbnail_url = hit['_source']['state']['derivedData']['thumbnail']['url']
        yield {'id': doc_id, 'url': thumbnail_url}

def index_traversing(es_object, size, index_name, source_index=None):
    """Copy the id and thumbnail URL of every document into another index.

    Pages through the source index in ascending ``_id`` order using
    ``search_after`` and, for each hit, creates a ``{"url": ...}`` document
    in ``index_name`` under the same id. Each id and a running total of
    processed documents are printed as progress output.

    Args:
        es_object: Elasticsearch client used for both searching and writing.
        size: Number of hits requested per page.
        index_name: Index the id/url documents are written to.
        source_index: Index to read from. Defaults to the index named by the
            ``INDEX_NAME`` environment variable.

    Note:
        Documents are written with ``create``, which rejects ids that already
        exist in ``index_name``; clear the target index before re-running.
    """
    if source_index is None:
        source_index = os.environ['INDEX_NAME']

    search_body = {
        "query": {"match_all": {}},
        "size": size,
        "from": 0,
        "sort": {
            "_id": "asc",  # document ID; assumed to be unique
        },
    }
    counter = 0  # just to make the waiting time less painful

    while True:
        response = es_object.search(index=source_index, body=search_body)
        hits = response['hits']['hits']

        for ID_url_dict in Extracting_url_ID(response):  # post the dictionaries
            es_object.create(
                index=index_name,
                id=ID_url_dict['id'],
                body={
                    "url": ID_url_dict['url']
                }
            )
            print(ID_url_dict['id'])

        # Count what was actually returned so a short or empty final page
        # does not inflate the total.
        counter += len(hits)
        print(counter)

        # A page shorter than requested (or empty) means the index is exhausted.
        if not hits or len(hits) < size:
            break

        # Continue after the last hit of this page.
        search_body["search_after"] = [hits[-1]['_id']]
            
# Copy id/url pairs into the new index, 20 documents per page.
index_traversing(es_object=local_es, size=20, index_name=index_name)


I need to stop this! XXXXXXD

In [None]:
# Drop the id/url index.
local_es.indices.delete(index="id_url_index")

In [None]:
import os
from elasticsearch import Elasticsearch
from piffle.iiif import IIIFImageClient 
from io import BytesIO
import httpx
from PIL import Image

#connect to ElasticSearch
local_es = Elasticsearch(
    hosts=os.environ['LOCAL_HOST'],
    http_auth=(
        os.environ['LOCAL_USER'],
        os.environ['LOCAL_PASS']
    )
)

# Name the target index once so the create call and later uses cannot drift apart.
index_name = 'id_url_index_2'

#create a new index
# Creating an index that already exists raises an error, so only create it
# when it is missing; this keeps the cell safe to re-run.
if not local_es.indices.exists(index=index_name):
    local_es.indices.create(index=index_name)



def Extracting_url_ID(response):
    """Yield an ``{'id', 'url'}`` dict for each hit of one search response page.

    ``id`` is the hit's ``_id``; ``url`` is read from ``_source.url``. Hits
    are yielded lazily, in the order they appear in
    ``response['hits']['hits']``.
    """
    for hit in response['hits']['hits']:
        doc_id = hit['_id']
        stored_url = hit['_source']['url']
        yield {'id': doc_id, 'url': stored_url}

def index_traversing(es_object, size, index_name, source_index='id_url_index'):
    """Copy the id and URL of every document from one index into another.

    Pages through the source index in ascending ``_id`` order using
    ``search_after`` and, for each hit, creates a ``{"url": ...}`` document
    in ``index_name`` under the same id. Each id and a running total of
    processed documents are printed as progress output.

    Args:
        es_object: Elasticsearch client used for both searching and writing.
        size: Number of hits requested per page.
        index_name: Index the id/url documents are written to.
        source_index: Index to read from. Defaults to ``'id_url_index'``.

    Note:
        Documents are written with ``create``, which rejects ids that already
        exist in ``index_name``; clear the target index before re-running.
    """
    search_body = {
        "query": {"match_all": {}},
        "size": size,
        "from": 0,
        "sort": {
            "_id": "asc",  # document ID; assumed to be unique
        },
    }
    counter = 0  # just to make the waiting time less painful

    while True:
        response = es_object.search(index=source_index, body=search_body)
        hits = response['hits']['hits']

        for ID_url_dict in Extracting_url_ID(response):  # post the dictionaries
            es_object.create(
                index=index_name,
                id=ID_url_dict['id'],
                body={
                    "url": ID_url_dict['url']
                }
            )
            print(ID_url_dict['id'])

        # Count what was actually returned so a short or empty final page
        # does not inflate the total.
        counter += len(hits)
        print(counter)

        # A page shorter than requested (or empty) means the index is exhausted.
        if not hits or len(hits) < size:
            break

        # Continue after the last hit of this page.
        search_body["search_after"] = [hits[-1]['_id']]
            
# Copy id/url pairs into the second index, 20 documents per page.
index_traversing(es_object=local_es, size=20, index_name=index_name)


In [None]:
# Drop the second id/url index.
local_es.indices.delete(index="id_url_index_2")

In [None]:
import os
from elasticsearch import Elasticsearch
from piffle.iiif import IIIFImageClient 
from io import BytesIO
import httpx
from PIL import Image

# Connect using the host and credentials from the environment.
local_es = Elasticsearch(
    hosts=os.environ["LOCAL_HOST"],
    http_auth=(os.environ["LOCAL_USER"], os.environ["LOCAL_PASS"]),
)

# Fetch the first 20 documents of the id/url index to inspect what was written.
response = local_es.search(
    index="id_url_index",
    body={
        "query": {"match_all": {}},
        "size": 20,
        "from": 0,
    },
)

# Bare last expression so the notebook displays the raw response.
response




In [None]:
# Look up a single document in the id/url index by its id.
response = local_es.get(index="id_url_index", id="a2ce3qkt")
response

In [None]:
# Number of documents currently stored in the id/url index.
response = local_es.count(index="id_url_index")

response