### 库

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm.notebook import tqdm, trange, tnrange
import json, re

In [2]:
import numpy as np
import pandas as pd
import os, sys
from time import time, sleep

In [3]:
from vVariableInspector import _getshapeof
from vUtil.vFile import fprint, readlines, linesReader
from vUtil.vEmail import sendEmail
from vUtil.vTime import convertSeconds

In [4]:
from vMysql import MysqlProxy

In [5]:
from util import frmt, rmUnseen, groupConcat, deleteIndexRows
from cfg import host, master, port, contType
from cfg import dbHost, dbUser, dbPwd
from cfg import ncols

### mysql、es

In [6]:
# Name of the Elasticsearch index this notebook works on: it is the `_index`
# of every bulk action built by getActions and is passed to deleteIndexRows.
index = 'landinn_software_copyright'

In [7]:
# Elasticsearch client shared by the clean-up and bulk-indexing cells;
# the host list and port are taken from cfg.
es = Elasticsearch(hosts=host, port=port)

In [8]:
# MySQL access object used for every query in this notebook; the host, user
# and password are taken from cfg.
db = MysqlProxy(ip=dbHost, user=dbUser, password=dbPwd)

### 函数

In [9]:
# Template that materialises one page of non-deleted `software_copyright` rows
# into the temporary table `tmp_softwareCopyright`. It is %-formatted with
# (offset, page size).
#
# The `order by` is required for correct paging: without it MySQL gives no
# guarantee about row order, so consecutive `limit offset, count` pages may
# overlap or skip rows. Ordering by golaxy_sc_id assumes that column is unique
# per row (it is used as the document `_id`) -- TODO confirm.
sTmpSoftwareCopyright = '''
create temporary table tmp_softwareCopyright
(
    select  golaxy_sc_id as gid, 
            full_name as name,
            simple_name as simpleName,
            registration_time,
            publish_time,
            event_time
    from software_copyright
    where !ifnull(is_deleted,0)
    order by golaxy_sc_id
    limit %d, %d
);
'''

# Drops the page table again so the next page can re-create it under the same name.
sRmTmpSoftwareCopyright = '''
drop temporary table tmp_softwareCopyright;
'''

In [10]:
# One row per (gid, affiliation) for the current page; `enterprise_id` is NULL
# when a copyright has no live affiliation.
#
# The soft-delete filter lives in the ON clause on purpose. As a WHERE filter
# it ran after the left join and removed a gid completely when all of its
# affiliations were soft-deleted, so that gid was missing from the result and
# could not be looked up when the bulk actions were built.
sSelectPublish = '''
select gid, cast(affiliation_id as char) as enterprise_id
from
    tmp_softwareCopyright as a
left join softwareCopyright_affiliation as b
on a.gid = b.sc_id
and !ifnull(b.is_deleted,0);
'''

In [11]:
# Reads back every column of the current page from the temporary table; the
# column names selected here are the ones getActions looks up.
sSelectData = '''
select gid, name, simpleName, registration_time, publish_time, event_time
from tmp_softwareCopyright;
'''

In [12]:
def getActions(data, publish):
    """Build the Elasticsearch bulk ``index`` actions for one page of rows.

    Parameters
    ----------
    data : table-like with the columns ``gid``, ``name``, ``simpleName``,
        ``registration_time``, ``publish_time`` and ``event_time``
        (one entry per software copyright).
    publish : table-like with the columns ``gid`` and ``enterprise_id``.
        If a gid occurs more than once, the last occurrence wins.

    Returns
    -------
    list of dict
        One action per row of ``data``, in row order; empty when ``data`` is empty.
    """
    def _missingToNone(value):
        # NaT / NaN / None all mean "no date"; emit None so the field is
        # serialized as null.
        return None if pd.isna(value) else value

    # gid -> enterprise ids. Columns are walked by position, so the result does
    # not depend on the frames having a default 0..n-1 index.
    p2e = dict(zip(publish['gid'], publish['enterprise_id']))

    columns = ('gid', 'name', 'simpleName',
               'registration_time', 'publish_time', 'event_time')
    actions = []
    for gid, name, simpleName, registrationTime, publishTime, eventTime in zip(
            *(data[column] for column in columns)):
        actions.append({
            '_op_type': 'index',  # bulk operation; one of index / update / create / delete
            '_index': index,      # target index (notebook-level constant)
            '_id': gid,
            '_source': {
                "id": gid,
                "name": rmUnseen(name, None),
                "simpleName": rmUnseen(simpleName, None),
                # .get(): a gid absent from `publish` yields None instead of
                # aborting the whole page with a KeyError.
                "enterprises": p2e.get(gid),
                "registration_time": _missingToNone(registrationTime),
                "publish_time": _missingToNone(publishTime),
                "event_time": _missingToNone(eventTime),
            },
        })
    return actions

### 索引

In [13]:
# Source MySQL table; the same name is hard-coded in the page-query template.
table = 'software_copyright'

In [14]:
# NOTE(review): presumably removes from the index the documents whose source
# rows (keyed by `golaxy_sc_id`) should no longer be there -- the exact rule
# lives in util.deleteIndexRows; confirm before running against production.
deleteIndexRows(db, es, table, 'golaxy_sc_id', index=index)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …




{'landinn_software_copyright': 0}

In [13]:
sizeBulk = 100  # rows read from MySQL and sent to Elasticsearch per page
now = 0         # row offset to start from; raise it to resume an interrupted run
# The saved cell read `db.count(, where=...)`, which is a SyntaxError because
# the first argument is missing. The source table name is passed here; this
# assumes MysqlProxy.count takes the table as its first argument -- TODO confirm.
# The filter matches the one in the page query, so the total agrees with the
# rows that will actually be paged.
nSoftwareCopyright = db.count(table, where='!ifnull(is_deleted,0)').values.item()

In [14]:
# now = 120

In [15]:
# sizeBulk = 100

In [16]:
# Show the start offset, page size and total row count before the long-running loop.
now, sizeBulk, nSoftwareCopyright

(0, 100, 24305)

In [17]:
# Copy the non-deleted software-copyright rows into Elasticsearch, one page of
# `sizeBulk` rows per iteration, then send a completion e-mail with the elapsed time.
startOfAll = time()
for i in trange(now, nSoftwareCopyright, sizeBulk):
    # Keep the current offset in the notebook-level `now`, so that after a
    # failure this cell can be re-run and continues from the failed page.
    now = i
    db.sql(sTmpSoftwareCopyright % (now, sizeBulk))
    try:
        data = db.sql(sSelectData)
        publish = db.sql(sSelectPublish)
    finally:
        # Always drop the page table and close the connection, even when a
        # select fails; otherwise a re-run would trip over the leftover
        # temporary table.
        # NOTE(review): db.sql is called again after close() on the next
        # iteration, so MysqlProxy presumably reconnects on demand -- confirm.
        try:
            db.sql(sRmTmpSoftwareCopyright)
        finally:
            db.close()

    actions = getActions(data, groupConcat(publish, 'gid', ';'))
    if actions:
        helpers.bulk(client=es, actions=actions)
sendEmail(f'insert cost time {convertSeconds(time() - startOfAll)}', 'insert landinn es softwareCopyrights complete')

HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))




### 关闭

In [18]:
# Release the Elasticsearch client once indexing is finished.
es.close()

In [19]:
# Close the MySQL connection as the final clean-up step.
db.close()