In [2]:
import json
from pathlib import Path
from elasticsearch import Elasticsearch
from process_framework import Reference
from process_framework.steps.versioning import GetElasticDocumentVersions
from pandas import DataFrame, Series, Index

import os
# Resolve the workspace root from the environment.
# The lookup uses .get() plus an explicit raise rather than
# `assert (root := os.environ[...])`: indexing os.environ raises a bare KeyError
# before the assert message can be shown when the variable is missing, and
# assert statements are removed entirely when Python runs with -O.
root = os.environ.get("WORKSPACE_ROOT")
if not root:
    # Covers both "variable not set" and "variable set to an empty string".
    raise RuntimeError('expected to find `WORKSPACE_ROOT` env var with path to workspace root')

In [3]:
# Name of the Elasticsearch index queried for the "remote" versions below.
INDEX = 'lhh-historic-places-live'

# Client keyword arguments are stored as JSON in <root>/secrets/elasticsearch.json.
secrets_path = Path(root) / 'secrets' / 'elasticsearch.json'
client_kwargs = json.loads(secrets_path.read_text())
elasticsearch = Elasticsearch(**client_kwargs)

# NOTE(review): the repr displayed by this cell includes the cluster endpoint URL;
# consider clearing the output before sharing the notebook.
elasticsearch

<Elasticsearch(['https://746546c864f349c8b41303b0a122ca9b.uksouth.azure.elastic-cloud.com:443'])>

In [4]:
# Holder for the versions read from the `INDEX` index; it is passed to
# GetElasticDocumentVersions in the next cell. Until a step assigns to it the
# repr shows `Reference[Index](None)`.
remote = Reference(Index)
remote

Reference[Index](None)

In [5]:
# Fields collected for every document of the remote index: `nhle_id` as-is and
# `sort` paired with `int` (presumably the type it is cast to -- TODO confirm
# against GetElasticDocumentVersions).
remote_fields = ('nhle_id', ('sort', int))

# include_id=False: the resulting index is keyed by the fields above only
# (names=['nhle_id', 'sort'] in the output of the following cell).
get_remote_versions = GetElasticDocumentVersions(
    remote, elasticsearch, INDEX, *remote_fields, include_id=False
)

# Run the step; the result is stored in `remote`.
get_remote_versions.do()

In [6]:
# Inspect what get_remote_versions stored in `remote`: a MultiIndex of
# (nhle_id, sort) pairs.
remote.get_value()

MultiIndex([('1037610',      1),
            ('1487251',      3),
            ('1260315',      4),
            ('1001031',      5),
            ('1013282',      7),
            ('1040293',     10),
            ('1002059',     11),
            ('1257777',     12),
            ('1408333',     13),
            ('1416933',     23),
            ...
            ('1363435', 396448),
            ('1362790', 396990),
            ('1294025', 397656),
            ('1292405', 397874),
            ('1292506', 397935),
            ('1367463', 399095),
            ('1375222', 399393),
            ('1375220', 399394),
            ('1375158', 399433),
            ('1372826', 400850)],
           names=['nhle_id', 'sort'], length=14084)

In [7]:
# Index that provides the "local" side of the comparison. It is read through the
# same Elasticsearch client as the remote index.
LOCAL_INDEX = 'search-nhle-dev'

# Holder that receives the (_id, version_uid) pairs found in LOCAL_INDEX.
local = Reference(Index)

# include_id=True: the document `_id` is the first level of the resulting index
# (names=['_id', 'version_uid'] in the output of the following cell).
local_fields = (('version_uid', int),)
get_local_versions = GetElasticDocumentVersions(
    local, elasticsearch, LOCAL_INDEX, *local_fields, include_id=True
)

# Run the step; the result is stored in `local`.
get_local_versions.do()

In [8]:
# Inspect what get_local_versions stored in `local`: a MultiIndex of
# (_id, version_uid) pairs.
local.get_value()

MultiIndex([('1412379', 476238),
            ('1412381', 476240),
            ('1412396', 476255),
            ('1412415', 476276),
            ('1412506', 476373),
            ('1412515', 476382),
            ('1412546', 476414),
            ('1412593', 476461),
            ('1412603', 476473),
            ('1412649', 476520),
            ...
            ('1359694', 325056),
            ('1359695', 452883),
            ('1359696', 325058),
            ('1359697', 325059),
            ('1359698', 325060),
            ('1359699', 325061),
            ('1359700', 325062),
            ('1359701', 325063),
            ('1359702', 325064),
            ('1359703', 325065)],
           names=['_id', 'version_uid'], length=401644)

In [9]:
# One independent, initially empty Index holder per kind of change; each one is
# handed to its detection step as `assign_to` in the next cell.
additions, updates, deletions = (Reference(Index) for _ in range(3))

In [10]:
from process_framework.steps.versioning.changes import DetectAdditions, DetectDeletions, DetectUpdates

# Every detection step compares the same two version sets; only the output
# holder differs.
version_sources = dict(local=local, remote=remote)

detect_additions = DetectAdditions(**version_sources, assign_to=additions)
detect_updates = DetectUpdates(**version_sources, assign_to=updates)
detect_deletions = DetectDeletions(**version_sources, assign_to=deletions)

# Run the steps in the same order as before: additions, updates, deletions.
for detection_step in (detect_additions, detect_updates, detect_deletions):
    detection_step.do()

In [11]:
# Inspect the ids that detect_additions assigned to `additions`.
additions.get_value()

Index(['1000012', '1000016', '1000022', '1000024', '1000029', '1000031',
       '1000037', '1000038', '1000040', '1000041',
       ...
       '1494013', '1494063', '1494064', '1494065', '1494066', '1494067',
       '1494068', '1494103', '1494500', '1494730'],
      dtype='object', length=387561)

In [13]:
# Inspect the ids that detect_deletions assigned to `deletions`.
deletions.get_value()

Index(['1429315'], dtype='object')

In [14]:
# Inspect the ids that detect_updates assigned to `updates`.
# NOTE(review): in the saved output this holds 401643 ids -- nearly every entry of
# `local` (401644) -- and it shares ids with the additions output (e.g. '1494730').
# The two sides are versioned by different fields (`sort` vs `version_uid`);
# confirm that comparing them is intended and that updates may overlap additions.
updates.get_value()

Index(['1000000', '1000001', '1000002', '1000003', '1000004', '1000005',
       '1000006', '1000007', '1000008', '1000009',
       ...
       '1494013', '1494063', '1494064', '1494065', '1494066', '1494067',
       '1494068', '1494103', '1494500', '1494730'],
      dtype='object', length=401643)