### 库

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm.notebook import tqdm, trange, tnrange
import json, re

In [2]:
import numpy as np
import pandas as pd
import os, sys
from time import time, sleep

In [3]:
from vVariableInspector import _getshapeof
from vUtil.vFile import fprint, readlines, linesReader
from vUtil.vEmail import sendEmail
from vUtil.vTime import convertSeconds, getNow

In [9]:
from vMysql import MysqlProxy

In [10]:
from util import frmt, rmUnseen, groupConcat, deleteIndexRows
from cfg import host, master, port, contType
from cfg import dbHost, dbUser, dbPwd
from cfg import ncols, nIdSep

In [11]:
from elasticsearch import TransportError

### utils

In [12]:
def ourError (error, errorType = ''):
    fprint(f'[ERROR] ({getNow()}) {errorType} : {repr(error)}\n', file='error.txt', path='errorScholar')

def ourLog (log, logType = ''):
    fprint(f'[LOG] ({getNow()}) {logType} : {repr(log)}\n', file='log.txt', path='logScholar')

In [13]:
def esRestart ():
    global es
    es.close()
    es=Elasticsearch(hosts=host,port=port,timeout=60)

### mysql、es

In [14]:
index = 'landinn_scholar'
indexHl = 'landinn_scholar_highlight'

In [15]:
es=Elasticsearch(hosts=host,port=port,timeout=60)

In [16]:
db = MysqlProxy(ip=dbHost, user=dbUser, password=dbPwd)

### sql语句

In [17]:
sTmpScholar = '''
create temporary table tmp_scholar
(
    select golaxy_author_id as scholar_id from authors
    where !ifnull(is_deleted, 0) and !ifnull(is_new, 0)
    limit %d, %d
);
'''

sTmpScholarId = '''
create temporary table tmp_scholar
(
    select golaxy_author_id as scholar_id from authors where golaxy_author_id = %d
);
'''

sRmTmpScholar = '''
drop temporary table tmp_scholar;
'''

In [18]:
sTmpPublish = '''
create temporary table tmp_publish
(
    select author_id as scholar_id, paper_id as publish_id 
    from 
        paper_author_affiliations as a
    join
        tmp_scholar as b
    on a.author_id = b.scholar_id
    where !ifnull(a.is_deleted, 0)
);'''

sInsertPublish = ['''
insert into tmp_publish 
(
    select author_id as scholar_id, patent_id as publish_id 
    from 
        patent_authors as a
    join
        tmp_scholar as b
    on a.author_id = b.scholar_id
    where !ifnull(a.is_deleted, 0)
);
''',
'''
insert into tmp_publish 
(
    select author_id as scholar_id, project_id as publish_id 
    from
        project_authors as a
    join
        tmp_scholar as b
    on a.author_id = b.scholar_id
    where !ifnull(a.is_deleted, 0)
);
''']

sRmTmpPublish = '''
drop temporary table tmp_publish;
'''

In [19]:
def addPublishId (x, n = nIdSep):
    if n is None:
        s = "'曓攨爩氎廤攨攡擟戅嬼壣(',publish_id,')'"
    else: s = ','.join(["'(',publish_id,')'"] * n)
    return f'''concat({s},{x})'''

In [20]:
def getSqlSelectData (n = nIdSep):
    return f'''
    select  scholar_id, 
            {addPublishId("paper_title", n)} as paper_title, 
            {addPublishId("keywords", n)} as keywords, 
            {addPublishId("abstract", n)} as abstract, 
            {addPublishId("patent_title", n)} as patent_title, 
            {addPublishId("signory", n)} as signory, 
            {addPublishId("summary", n)} as summary, 
            {addPublishId("project_title", n)} as project_title, 
            {addPublishId("if (d1 is null and d2 is null and d3 is null,null,concat('{',ifnull(d1,''),'}{',ifnull(d2,''),'}{',ifnull(d3,''),'}'))",n)} as discipline,
            {addPublishId("ifnull(project_description, project_description_en)", n)} as project_description
    from
    (
        select  scholar_id, publish_id, paper_title, keywords, abstract, patent_title, signory, summary,
                ifnull(project_title, project_title_en) as project_title,
                ifnull(discipline_first, discipline_first_en) as d1,
                ifnull(discipline_secondary, discipline_secondary_en) as d2,
                ifnull(discipline_tertiary, discipline_tertiary_en) as d3 from
        (
            select  scholar_id, publish_id, paper_title, keywords, abstract, patent_title, 
                    ifnull(patent_signory, patent_signory_en) as signory, 
                    ifnull(patent_abstract, patent_abstract_en) as summary from 
            (
                select scholar_id, publish_id, paper_title, keywords, abstract, ifnull(patent_title, patent_title_en) as patent_title from
                (
                    select scholar_id, publish_id, paper_title, keywords, ifnull(abstract, abstract_en) as abstract from
                    (
                        select scholar_id, publish_id, ifnull(paper_title, paper_title_en) as paper_title, ifnull(keyword, keyword_en) as keywords
                        from tmp_publish as a 
                        left join papers on a.publish_id = papers.golaxy_paper_id
                        where !ifnull(papers.is_deleted, 0)
                    ) as b
                    left join papers_abstracts on b.publish_id = papers_abstracts.paper_id
                    where !ifnull(papers_abstracts.is_deleted, 0)
                ) as c
                left join patent on c.publish_id = patent.golaxy_patent_id
                where !ifnull(patent.is_deleted, 0)
            )as d
            left join patent_abstracts on d.publish_id = patent_abstracts.patent_id
            where !ifnull(patent_abstracts.is_deleted, 0)
        ) as e
        left join project on e.publish_id = project.golaxy_project_id
        where !ifnull(project.is_deleted, 0)
    )as f
    left join project_abstracts on f.publish_id = project_abstracts.project_id
    where !ifnull(project_abstracts.is_deleted, 0);
    '''

In [21]:
sSelectData = getSqlSelectData()
sSelectDataMini = getSqlSelectData(None) 

In [1]:
get_ipython()

<ipykernel.zmqshell.ZMQInteractiveShell at 0x2592b4bd4c0>

In [22]:
sSelectScholarInfo = '''
select  scholar_id, 1 as is_chinese, if(phone is null, 0, 1) as has_phone, if(email is null, 0, 1) as has_email,
        ifnull(title, title_en) as title, birthday, display_name as name, brief
from tmp_scholar as a
join authors as b
on a.scholar_id = b.golaxy_author_id
where !ifnull(b.is_deleted,0) and !ifnull(b.is_new, 0);
'''

### 函数

In [23]:
def stdData (data):
    return {
        data['scholar_id'][i] : {
            "paper_title" : data['paper_title'][i],
            "abstract" : data['abstract'][i],
            "keywords" : data['keywords'][i],
            "patent_title" : data['patent_title'][i],
            "signory" : data['signory'][i],
            "summary" : data['summary'][i],
            "project_title" : data['project_title'][i],
            "discipline" : data['discipline'][i],
            "project_description" : data['project_description'][i],
        } for i in range(len(data))
    }

In [24]:
def stdInfo (info):
    return {
        info['scholar_id'][i] : {
            'is_chinese' : info['is_chinese'][i],
            'has_email' : info['has_email'][i],
            'has_phone' : info['has_phone'][i],
            'birthday' : None if info['birthday'][i] is pd.NaT else info['birthday'][i],
            'title' : info['title'][i],
            'name' : info['name'][i],
            'brief' : info['brief'][i],
        } for i in range(len(info))
    }

In [25]:
def getActions (data, info):
    actions = []
    for id in info:
        action={'_op_type':'index',###操作 index update create delete  
            '_index': index,#index
            '_id' : str(id),
            '_source':
           {
                "scholar_id" : str(id),
                "paper_title" : data.get(id, {}).get('paper_title', ''),
                "paper_abstract" : data.get(id, {}).get('abstract', ''),
                "paper_keywords" : data.get(id, {}).get('keywords', ''),
                "patent_title" : data.get(id, {}).get('patent_title', ''),
                "patent_signory" : data.get(id, {}).get('signory', ''),
                "patent_summary" : data.get(id, {}).get('summary', ''),
                "project_title" : data.get(id, {}).get('project_title', ''),
                "project_discipline" : data.get(id, {}).get('discipline', ''),
                "project_description" : data.get(id, {}).get('description', ''),
                "is_chinese" : info[id]['is_chinese'],
                "has_email" : info[id]['has_email'],
                "has_phone" : info[id]['has_phone'],
                "birthday" : info[id]['birthday'],
                "title" : info[id]['title'],
                "name" : info[id]['name'],
                "brief" : info[id]['brief'],
            }
        }
        actions.append(action)
    return actions

In [26]:
def getHighLightActions (data, info):
    actions = []
    for id in info:
        action={'_op_type':'index',###操作 index update create delete  
            '_index': indexHl,#index
            '_id' : str(id),
            '_source':
           {
                "scholar_id" : str(id),
                "is_chinese" : info[id]['is_chinese'],
                "has_email" : info[id]['has_email'],
                "has_phone" : info[id]['has_phone'],
                "birthday" : info[id]['birthday'],
                "title" : info[id]['title'],
                "name" : info[id]['name'],
                "brief" : info[id]['brief'],
            }
        }
        actions.append(action)
    return actions

In [22]:
def getIndexActions (id):
    return [
        {
            '_op_type':'index',###操作 index update create delete  
            '_index': index,#index
            '_id' : str(id),
            '_source':
            {
                "scholar_id" : str(id),
            }
        }
    ]

In [23]:
def getUpdateActions (id, key, value):
    return [
        {
            '_op_type':'update',###操作 index update create delete  
            '_index': index,#index
            '_id' : str(id),
            'doc':
            {
                key : value
            }
        }
    ]

In [27]:
def __getData (mode, *args):
    while 1:
        try:
            db.sql((sTmpScholar if mode else sTmpScholarId) % args)
            db.sql(sTmpPublish)
            for x in sInsertPublish: db.sql(x)

            data = db.sql(sSelectData if mode else sSelectDataMini)

            data = groupConcat(data)
            info = db.sql(sSelectScholarInfo)
            db.sql(sRmTmpScholar)
            db.sql(sRmTmpPublish)
            db.close()
        except pymysql.Error as e:
            ourError(str(e), 'mysql read error')
            db.close()
            continue
        break
    return stdData(data), stdInfo(info)

In [28]:
def getRangeData (now, sizeBulk):
    return __getData(1, now, sizeBulk)
def getIdData (id):
    return __getData(0, id)

In [29]:
def miniInsert (data, info):
    for id in tqdm(info, leave=False):
        actions = getActions(data, {id : info[id]})
        try:
            helpers.bulk(client=es,actions=actions)
        except TransportError as e:
            if 'Data too large' in str(e) or e.status_code == 413:
                ourError(str(e), f'Data too large with id({id})')
                esRestart()
                
                data, info = getIdData(id)
                actions = getActions(data, info)
                helpers.bulk(client=es,actions=actions)
            else: raise e

### 索引

In [30]:
table = 'authors'

In [31]:
deleteIndexRows(db, es, table, 'golaxy_author_id', hasIsNew=True, index=[index, indexHl])

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=37.0), HTML(value='')), layout=Layout(dis…




{'landinn_scholar': 0, 'landinn_scholar_highlight': 0}

In [27]:
sizeBulk = 20
now = 0
nScholar = db.count(table, where='!ifnull(is_deleted,0)').values.item()

In [28]:
# now = 613860 

In [29]:
# sizeBulk = 20

In [30]:
# nScholar = 531720 

In [32]:
now, sizeBulk, nScholar

(0, 20, 596945)

In [33]:
startOfAll = time()
tr = trange(now, nScholar, sizeBulk, ncols=ncols)
for i in tr:
    tr.set_description(f'({getNow()}){i}')
    now = i
        
    data, info = getRangeData(now, sizeBulk)
    
    while 1:
        try:
            helpers.bulk(client=es,actions=getHighLightActions(data,info))
            break
        except ConnectionTimeout as e:
            ourError(str(e), f'connetion error indexing hl with Range({now},{now + sizeBulk})')

    actions = getActions(data, info)
    try:
        helpers.bulk(client=es,actions=actions)
    except TransportError as e:
        if 'Data too large' in str(e) or e.status_code == 413:
            ourError(str(e), f'Data too large({now},{sizeBulk})')
            esRestart()
            miniInsert (data, info)
        else: raise e
        
sendEmail(f'insert cost time {convertSeconds(time() - startOfAll)}', f'insert landinn scholar es complete host({host})')

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=29848.0), HTML(value='')), layout=Layout(…




### 关闭es和mysql

In [40]:
es.close()

In [41]:
db.close()

### 检查是否全部索引

In [95]:
a, b  = getRangeData(now,sizeBulk)

In [98]:
sizeBulk = 20
now = 0
for i in trange(now, nScholar, sizeBulk):
    
    
    now = i

    sql(sTmpScholar % (now, sizeBulk))

    info = sql(sSelectScholarInfo)
    sql(sRmTmpScholar)
    
    info = stdInfo(info)
    for id in info:
        x = es.search(index=index, size=20, body = {
            "query": {
                "term":
                {
                    "scholar_id": str(id)
                }
            }
        })['hits']['total']['value']
        if x == 0: print(now, id)

HBox(children=(FloatProgress(value=0.0, max=28025.0), HTML(value='')))

531660 1085425043282198716
531660 1052706856245949717
531660 1078295748546144714
531660 1052707457654615122
531660 1078188351634816031
531660 1078200861096157269
531660 1078321378419550461
531660 1078191764607081356
531660 1078278912886708079
531660 1078315243247530740
531660 1078187993512555252
531660 1078160262712284689
531660 1078223142992239852
531660 1052707673980028752
531660 1078191558868080168
531660 1052706758485089656
531660 1078280276635296532
531660 1078187210616348235
531660 1085416048236560464
531660 1078294466565512961
531680 1078271117248450416
531680 1078190742555853086
531680 1078159398639517711
531680 1085397714573066516
531680 1032650177554808844
531680 1078203293847007676
531680 1078291310599210272
531680 1078233836395446382
531680 1078271807999989137
531680 1032642430545297425
531680 1078234214583255885
531680 1078228078945637387
531680 1078284632256432125
531680 1078244796803848805
531680 1078201834094995239
531680 1078191116520000229
531680 1078212888652941387
5