### 库

In [1]:
import requests, os, sys, json, math
import numpy as np, pandas as pd

In [2]:
from vUtil.vFile import fprint

In [3]:
from vVariableInspector import _getshapeof

In [4]:
from vEs import EsProxy

In [5]:
from vMysql import MysqlProxy, stdSqlData, stdSqlCol
import pymysql
import pytz
from vUtil.vTime import getNow

In [6]:
from elasticsearch import Elasticsearch
from elasticsearch import TransportError, ConnectionError, ConnectionTimeout
from elasticsearch import helpers

In [7]:
from pandas._libs.tslibs.timedeltas import Timedelta as td
from pandas import Timestamp as ts

In [8]:
from vUtil.vTqdm import tqdm, trange
from vUtil.vLog import frmt, VLog, _print, print, vError, vLog
# Hand this notebook's global namespace to the custom `print` imported from vUtil.vLog.
# NOTE(review): what vLog does with `print.globals` is not visible in this notebook — confirm.
print.globals = globals()

### 配置

In [9]:
# Elasticsearch host / port and the base HTTP URL built from them.
# NOTE(review): nothing visible in this notebook reads these three values —
# EsProxy() below is constructed without arguments — confirm they are still needed.
esHost = '172.17.184.30'
esPort = 9200
esMaster = f'http://{esHost}:{esPort}'

In [10]:
# MySQL handle; it is passed to main()/findEsInvalid() and called there as db(sql, ifCommit=True).
# NOTE(review): built with no arguments, so connection settings presumably live inside MysqlProxy — confirm.
db = MysqlProxy()

In [11]:
# Elasticsearch handle; this notebook calls search / scroll / delete_by_query / close on it.
# NOTE(review): built with no arguments, so the target cluster is presumably configured inside EsProxy — confirm.
es = EsProxy()

### 函数

In [17]:
# Default batch size shared by the functions below: scroll page size, number of ids
# checked per SQL statement, and number of ids sent per delete_by_query request.
dftSize = 1000

In [34]:
def esScroll (es, index, scroll='10m', size=dftSize):
    """Iterate over every document of an index, one scroll page at a time.

    Runs a ``match_all`` search sorted by ``_doc`` with ``_source`` disabled,
    then keeps requesting the next scroll page until a page comes back empty.

    Args:
        es: client exposing ``search`` and ``scroll`` (and, optionally,
            ``clear_scroll``) with the Elasticsearch client call signatures.
        index: name of the index to walk.
        scroll: keep-alive of the scroll context, e.g. ``'10m'``.
        size: maximum number of hits per page.

    Yields:
        The ``hits`` object of each non-empty page (the response's
        ``['hits']``, whose own ``['hits']`` entry is the list of documents).
    """
    query = '''
    {
        "query": {
            "match_all": {}
        },
        "sort" : "_doc",
        "_source": false
    }'''

    rst = es.search(index=index, body=query, scroll=scroll, size=size)
    try:
        # Same request sequence as before: every non-empty page is yielded and
        # the first empty page ends the walk.
        while rst['hits']['hits']:
            yield rst['hits']
            rst = es.scroll(f'''
            {{
                "scroll": "{scroll}",
                "scroll_id" : "{rst['_scroll_id']}"
            }}
            ''')
    finally:
        # Release the server-side scroll context instead of letting it linger
        # for the whole keep-alive. This also runs when the consumer stops
        # early (exception, KeyboardInterrupt, abandoned generator).
        scroll_id = rst.get('_scroll_id')
        clear = getattr(es, 'clear_scroll', None)  # tolerate proxies without it
        if scroll_id and clear is not None:
            try:
                clear(scroll_id=scroll_id)
            except TransportError:
                # Best-effort cleanup: the context may already be gone, and it
                # expires on its own after `scroll` anyway.
                pass

In [35]:
def findEsInvalid (es, index, db, table, idCol, size=dftSize, hasIsNew=False):
    """Collect the ``_id`` of every document in an ES index that has no live MySQL row.

    The index is walked page by page with ``esScroll``. For each page, one SQL
    statement left-joins the page's ids against ``table`` on ``idCol`` and
    returns the ids whose row is missing, or whose row has ``is_deleted`` set
    (or ``is_new`` set, when ``hasIsNew`` is true).

    Args:
        es: client handed to ``esScroll``.
        index: name of the index to scan.
        db: callable that executes SQL, invoked as ``db(sql, ifCommit=True)``;
            its result is indexed with ``['id']``.
        table: MySQL table to check. Interpolated into the SQL verbatim, so it
            must come from trusted code.
        idCol: column of ``table`` holding the ES ``_id``; also interpolated
            verbatim.
        size: scroll page size, i.e. number of ids checked per SQL statement.
        hasIsNew: also treat rows with a truthy ``is_new`` column as invalid.

    Returns:
        List of invalid ids; empty when the index holds no documents.
    """
    ids = []
    cnt = 0
    # Bound up front: an empty index yields no page at all, and the closing
    # progress line below still needs a value (it used to raise UnboundLocalError).
    total = 0
    # Escape ids with MySQL's own rules. Python's repr() emits \xNN / \uNNNN
    # escapes that MySQL does not understand, so an id containing such a
    # character would never match its row and would be reported as invalid.
    quote = pymysql.converters.escape_string
    with tqdm(esScroll(es, index, size=size)) as tqs:
        for page in tqs:
            if not cnt:
                # First page: the total hit count fixes the progress-bar length.
                total = page['total']['value']
                tqs.total = math.ceil(total / size)
            print(f'{cnt}/{total}({len(ids)})', tqdm=tqs)
            hits = page['hits']
            cnt += len(hits)
            # Inline derived table holding this page's ids, one SELECT per id.
            unions = ' union '.join(f"select '{quote(hit['_id'])}' as id" for hit in hits)
            rst = db(f'''
            select a.id from
            (
                {unions}
            ) as a
            left join {table} as b
            on a.id = b.{idCol}
            where b.{idCol} is null or is_deleted{' or is_new' if hasIsNew else ''};
            ''', ifCommit=True)
            ids += [*rst['id']]
        print(f'{cnt}/{total}({len(ids)})', tqdm=tqs)
    return ids

In [36]:
def rmEsInvalid (es, index, ids, size=dftSize):
    """Delete the given document ids from an ES index in batches.

    Args:
        es: client exposing ``delete_by_query``.
        index: name of the index to delete from.
        ids: sequence of document ``_id`` values to remove.
        size: number of ids sent per ``delete_by_query`` request.

    Returns:
        Sum of the ``deleted`` counters reported by the responses
        (0 when ``ids`` is empty; no request is sent in that case).
    """
    if not ids:
        return 0

    removed = 0
    with trange(0, len(ids), size) as bar:
        for start in bar:
            chunk = ids[start:start + size]
            resp = es.delete_by_query(
                index=index,
                body={"query": {"terms": {"_id": chunk}}},
            )
            # A response without a `deleted` field counts as zero.
            removed += resp.get('deleted', 0)
    return removed

In [37]:
def main (es, index, db, table, idCol, size=dftSize, hasIsNew=False):
    """Find the documents of ``index`` that are invalid with respect to ``table``, then delete them.

    Thin composition of ``findEsInvalid`` and ``rmEsInvalid``; all arguments
    are forwarded unchanged, and ``size`` is used as the batch size of both steps.

    Returns:
        The number of deleted documents as reported by ``rmEsInvalid``.
    """
    return rmEsInvalid(
        es,
        index,
        findEsInvalid(es, index, db, table, idCol, size=size, hasIsNew=hasIsNew),
        size=size,
    )

### 测试es scroll

In [59]:
# Index name used by the scroll experiments in this section.
index = 'test'

In [73]:
# Drain the generator with one document per page so each scroll round-trip is visible.
# NOTE(review): the saved output lists bare hit arrays, while esScroll as defined above
# yields the whole `hits` object — the output looks stale; re-run to refresh.
list(esScroll(es, index, size=1))

[[{'_index': 'test',
   '_type': '_doc',
   '_id': '林伯威',
   '_score': None,
   'sort': [0]}],
 [{'_index': 'test',
   '_type': '_doc',
   '_id': 'cars中国科学院计算技术研究所<a></a>the a an hhh HHj',
   '_score': None,
   'sort': [1]}],
 [{'_index': 'test',
   '_type': '_doc',
   '_id': 'a b c d',
   '_score': None,
   'sort': [2]}],
 [{'_index': 'test',
   '_type': '_doc',
   '_id': '你好',
   '_score': None,
   'sort': [3]}]]

In [60]:
# `query` only exists as a local inside esScroll, so this cell raised NameError
# on a fresh kernel. Define the same match_all body here to make it self-contained.
query = '''
{
    "query": {
        "match_all": {}
    },
    "sort" : "_doc",
    "_source": false
}'''
# Open a scroll by hand (1-minute keep-alive, one hit per page) to inspect the raw response.
scroll = es.search(index=index, body=query, scroll="1m", size=1)
scroll

{'_scroll_id': 'DXF1ZXJ5QW5kRmV0Y2gBAAAAAABBNCkWQmptR1FzR3BTWk9ua09OU2NMdnpFdw==',
 'took': 0,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'test',
    '_type': '_doc',
    '_id': '林伯威',
    '_score': None,
    'sort': [0]}]}}

In [69]:
# Request the next page of the scroll opened in the previous cell, using the same
# JSON body shape that esScroll sends (keep-alive plus the current scroll id).
es.scroll(f'''
{{
    "scroll": "1m", 
    "scroll_id" : "{scroll['_scroll_id']}"
}}
''')

{'_scroll_id': 'DXF1ZXJ5QW5kRmV0Y2gBAAAAAABBNCkWQmptR1FzR3BTWk9ua09OU2NMdnpFdw==',
 'took': 1,
 'timed_out': False,
 'terminated_early': True,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

### rm invalid

In [28]:
# Prune `landinn_scholar` against MySQL `authors` (key golaxy_author_id); is_new rows also count as invalid.
main(es=es, index='landinn_scholar', db=db, table='authors', idCol='golaxy_author_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=3.0), HTML(value='')), layout=Layout(disp…




827

In [38]:
# Prune `landinn_scholar_highlight` against MySQL `authors` (key golaxy_author_id); is_new rows also count as invalid.
main(es=es, index='landinn_scholar_highlight', db=db, table='authors', idCol='golaxy_author_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=3.0), HTML(value='')), layout=Layout(disp…




2827

In [39]:
# Prune `landinn_scholar_abroad` against MySQL `authors_en` (key golaxy_author_id); is_new rows also count as invalid.
main(es=es, index='landinn_scholar_abroad', db=db, table='authors_en', idCol='golaxy_author_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

In [40]:
# Prune `landinn_scholar_abroad_highlight` against MySQL `authors_en` (key golaxy_author_id); is_new rows also count as invalid.
main(es=es, index='landinn_scholar_abroad_highlight', db=db, table='authors_en', idCol='golaxy_author_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

In [41]:
# Prune `landinn_paper` against MySQL `papers` (key golaxy_paper_id).
main(es=es, index='landinn_paper', db=db, table='papers', idCol='golaxy_paper_id')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1.0), HTML(value='')), layout=Layout(disp…




698

In [42]:
# Prune `landinn_patent` against MySQL `patent` (key golaxy_patent_id).
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so it did not complete.
main(es=es, index='landinn_patent', db=db, table='patent', idCol='golaxy_patent_id')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




KeyboardInterrupt: 

In [43]:
# Prune `landinn_project` against MySQL `project` (key golaxy_project_id).
main(es=es, index='landinn_project', db=db, table='project', idCol='golaxy_project_id')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1.0), HTML(value='')), layout=Layout(disp…




277

In [44]:
# Prune `landinn_enterprise` against MySQL `affiliations` (key affiliation_id); is_new rows also count as invalid.
main(es=es, index='landinn_enterprise', db=db, table='affiliations', idCol='affiliation_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

In [45]:
# Prune `landinn_enterprise_highlight` against MySQL `affiliations` (key affiliation_id); is_new rows also count as invalid.
main(es=es, index='landinn_enterprise_highlight', db=db, table='affiliations', idCol='affiliation_id', hasIsNew=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

In [46]:
# Prune `landinn_product` against MySQL `product` (key golaxy_product_id).
main(es=es, index='landinn_product', db=db, table='product', idCol='golaxy_product_id')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

In [47]:
# Prune `landinn_software_copyright` against MySQL `software_copyright` (key golaxy_sc_id).
main(es=es, index='landinn_software_copyright', db=db, table='software_copyright', idCol='golaxy_sc_id')

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




0

### 关闭

In [57]:
# Close the EsProxy handle created in the configuration section.
es.close()

In [58]:
# Close the MysqlProxy handle created in the configuration section.
db.close()