### 库

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm.notebook import tqdm, trange, tnrange
import json, re

In [2]:
import numpy as np
import pandas as pd
import os, sys
from time import time, sleep

In [3]:
from vVariableInspector import _getshapeof
from vUtil.vFile import fprint, readlines, linesReader
from vUtil.vEmail import sendEmail
from vUtil.vTime import convertSeconds, getNow

In [4]:
from vMysql import MysqlProxy

In [5]:
from util import frmt, rmUnseen, groupConcat, deleteIndexRows
from cfg import host, master, port, contType
from cfg import dbHost, dbUser, dbPwd
from cfg import ncols

### mysql、es

In [6]:
index = 'landinn_patent'

In [7]:
es=Elasticsearch(hosts=host,port=port)

In [8]:
db = MysqlProxy(ip=dbHost, user=dbUser, password=dbPwd)

### 函数

In [9]:
sTmpPatent = '''
create temporary table tmp_patent
(
    select  golaxy_patent_id as gid,
            ifnull(patent_title, patent_title_en) as title,
            applicant_date,
            grant_date,
            publication_date
    from patent
    where !ifnull(is_deleted,0)
    limit %d, %d
);
'''
sRmTmpPatent = '''
drop temporary table tmp_patent;
'''

In [10]:
sSelectPublishScholar = '''
select gid, cast(author_id as char) as scholar_id, null as enterprise_id
from
    tmp_patent as a
left join patent_authors as b
on a.gid = b.patent_id
where !ifnull(b.is_deleted,0);
'''

In [11]:
sSelectPublishEnterprise = '''
select gid, null as scholar_id, cast(applicant_id as char) as enterprise_id
from 
    tmp_patent as c
left join patent_applicants as d
on c.gid = d.patent_id
where !ifnull(d.is_deleted,0);
'''

In [12]:
sSelectData = '''
select  gid, title, applicant_date, grant_date, publication_date,
        ifnull(patent_signory, patent_signory_en) as signory, 
        ifnull(patent_abstract, patent_abstract_en) as summary
from
    tmp_patent as a
left join patent_abstracts as b on a.gid = b.patent_id
where !ifnull(b.is_deleted,0);
'''

In [13]:
def getActions (data, publish):
    actions = []
    
    p2s = {publish['gid'][i] : publish['scholar_id'][i] for i in range(len(publish))}
    p2e = {publish['gid'][i] : publish['enterprise_id'][i] for i in range(len(publish))}
    
    for i in range(len(data)):
#         if data['gid'][i] not in p2s: continue
        action={'_op_type':'index',#操作 index update create delete  
            '_index':index,#index
            '_id' : data['gid'][i],
            '_source':
           {
                "id" : data['gid'][i],
                "title" : rmUnseen(data['title'][i], None),
                "signory" : rmUnseen(data['signory'][i], None),
                "summary" : rmUnseen(data['summary'][i], None),
                "scholars" : p2s[data['gid'][i]],
                "enterprises" : p2e[data['gid'][i]],
                "applicant_date" : None if data['applicant_date'][i] is pd.NaT else data['applicant_date'][i],
                "grant_date" : None if data['grant_date'][i] is pd.NaT else data['grant_date'][i],
                "publication_date" : None if data['publication_date'][i] is pd.NaT else data['publication_date'][i]
            }
        }
        actions.append(action)
    return actions

### 索引

In [14]:
table = 'patent'

In [15]:
deleteIndexRows(db, es, table, 'golaxy_patent_id', index=index)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1.0), HTML(value='')), layout=Layout(disp…




{'landinn_patent': 0}

In [14]:
sizeBulk = 100
now = 0
nPatent = db.count(table, where='!ifnull(is_deleted,0)').values.item()

In [15]:
# now = 120

In [16]:
# sizeBulk = 2

In [17]:
# nPatent = 300

In [18]:
now, sizeBulk, nPatent

(0, 100, 1839824)

In [19]:
startOfAll = time()
tr = trange(now, nPatent, sizeBulk, ncols=ncols)
for i in tr:
    tr.set_description(f'({getNow()}){i}')
    now = i
    
    db.sql(sTmpPatent % (now, sizeBulk))
    
    data = db.sql(sSelectData)
    ps = db.sql(sSelectPublishScholar)
    pe = db.sql(sSelectPublishEnterprise)
    publish = ps.append(pe, ignore_index=True)
    db.sql(sRmTmpPatent)
    db.close()
    
    actions = getActions(data, groupConcat(publish, 'gid', ';'))
    if actions: helpers.bulk(client=es,actions=actions)
sendEmail(f'insert cost time {convertSeconds(time() - startOfAll)}', 'insert landinn es patents complete')

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=18399.0), HTML(value='')), layout=Layout(…




### 关闭

In [20]:
es.close()

In [21]:
db.close()

### 检查是否全部索引

In [81]:
sizeBulk = 100
now = 0
tr = trange(now, nPatent, sizeBulk, ncols=ncols)
for i in tr:
    tr.set_description(f'({getNow()}){i}')
    
    now = i

    db.sql(sTmpPatent % (now, sizeBulk))
    
    data = db.sql(sSelectData)
    db.sql(sRmTmpPatent)
    db.close()
    


    for i in range(len(data)):
        x = es.search(index=index, size=20, body = {
            "query": {
                "term":
                {
                    "id": data['gid'][i]
                }
            }
        })['hits']['total']['value']
        if x == 0: print(now, data['gid'][i])

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=18399.0), HTML(value='')), layout=Layout(…

200 1032528451470688534
200 1032528451470688548
200 1032528451470688591
300 1032528451470688657
300 1032528451470688671
300 1032528451470688680
400 1032528451470688799
400 1032528451470688864
400 1032528451470688876
500 1032528451470688948
600 1032528451470689105
700 1032528451470689255
700 1032528451470689307
800 1032528451470689463
1000 1032528451470689735
1200 1032528451470690006
1300 1032528451470690068
1400 1032528451470690186
1400 1032528451470690207
1400 1032528451470690226
1400 1032528451470690238
1500 1032528451470690298
1500 1032528451470690380
1600 1032528451470690425
1600 1032528451470690442
1600 1032528451470690455
1600 1032528451470690508
1700 1032528451470690576
1700 1032528451470690601
1700 1032528451470690625
1700 1032528451470690639
1800 1032528451470690779
1800 1032528451470690800
1900 1032528451470690905
2000 1032528451470690979
2000 1032528451470690997
2000 1032528451470691000
2000 1032528451470691007
2200 1032528451470691369
2300 1032528451470691392
2300 103252845

KeyboardInterrupt: 