In [1]:
import pandas as pd
import json
import re
from time import time, sleep
import pymysql
from datetime import datetime
from pymysql import ProgrammingError

In [2]:
import numpy as np
import os
import sys

In [3]:
from tqdm.notebook import tqdm, trange, tnrange
from vVariableInspector import _getshapeof
from vUtil.vFile import fprint, readlines, linesReader
from vUtil.vEmail import sendEmail
from vUtil.vTime import getNow, convertSeconds
from vUtil.vUe import rmUe

In [4]:
import pymysql
from vMysql import addCursor, select, sql, count

In [5]:
from vToEnglish.detect import hasChinese, countChinese
from vToEnglish.translate import translate
from vToEnglish.toEnglish import wait, toEnglish

In [6]:
from simpleAndTranditional import trd2smp
from keywords.selected import kwSelected as words

In [7]:
from youdaoSynonym import YoudaoSynonym

In [19]:
def ourError (error, errorType = 'invalid keyword'):
    """Write one ``[ERROR]`` line through fprint (file 'error.txt', path 'logKeySelected').

    The keyword shown in the line is not a parameter: it is read from the
    module-level variable ``word``, so that name has to be bound (the driver
    loop binds it) before this helper is called.

    error     -- payload rendered between double quotes at the end of the line
    errorType -- short label printed in front of the payload
    """
    message = f'[ERROR] ({getNow()}) {repr(word)} : {errorType}: "{error}"\n'
    fprint(message, file='error.txt', path='logKeySelected')

In [20]:
def ourLog (log, logType = ''):
    """Write one ``[LOG]`` line through fprint (file 'log.txt', path 'logKeySelected').

    The keyword shown in the line comes from the module-level variable
    ``word``, not from an argument, so it must be bound before the call.

    log     -- payload rendered between double quotes at the end of the line
    logType -- short label printed in front of the payload
    """
    message = f'[LOG] ({getNow()}) {repr(word)} : {logType} : "{log}"\n'
    fprint(message, file='log.txt', path='logKeySelected')

In [10]:
# Shared synonym client; the rest of the notebook calls yd.getSynonyms / yd.start / yd.close on it.
yd = YoudaoSynonym()

In [11]:
# Open the MySQL connection used by every query in this notebook.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts positional
# host/user/password. Host, user and password can be overridden through the
# environment so credentials need not live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook running exactly as
# before; drop the password fallback once KEYWORDS_DB_PASSWORD is provisioned.
db = pymysql.connect(
    host=os.environ.get('KEYWORDS_DB_HOST', '10.60.1.78'),
    user=os.environ.get('KEYWORDS_DB_USER', 'root'),
    password=os.environ.get('KEYWORDS_DB_PASSWORD', 'linlei'),
    port=3306,
    charset='utf8',
    db='keywords',
)
cursor = db.cursor()
# Hand the cursor to the vMysql helpers (select / sql / count).
addCursor(cursor)

In [12]:
# sql('flush tables;')

In [32]:
# Close the synonym client and start it again before the lookups below.
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours — confirm it is still needed on a fresh top-to-bottom run.
yd.close()
yd.start()

In [13]:
# Spot check: keyword mixing Latin and Chinese characters.
yd.getSynonyms('kagomé啨晶格')

{'kagome 啨 lattice'}

In [14]:
# Spot check: Japanese-script input; the recorded output is a single Chinese phrase.
yd.getSynonyms('横断検索')

{'横向搜索'}

In [15]:
# Spot check: plain Chinese term; the recorded output holds several English synonyms.
yd.getSynonyms('刀位数据')

{'cl data', 'cutter data', 'cutter location data'}

In [16]:
# Spot check: long chemical name containing digits, hyphens and spaces.
yd.getSynonyms('2-偕二硝甲基-5-硝基四唑4-氨基-1 2 4-三唑盐')

{'2 - bisphosphonate methyl nitrate - 5 - nitro tetrazolium 4 - amino - 1 2 4 - triazole salt',
 '2 amido methyl 5 nitroteazole 4 amino 1 2 4 triazole salt',
 '2 bisphosphonate methyl nitrate 5 nitro tetrazolium 4 amino 1 2 4 triazole salt',
 '2-amido-methyl-5-nitroteazole 4-amino-1 2 4-triazole salt'}

In [18]:
# Spot check with the explicit 'ch2en' mode argument.
yd.getSynonyms('气体绝缘金属封闭输电线路', 'ch2en')

{'gas insulated metal enclosed transmission lines',
 'gas insulated metal-enclosed transmission lines'}

In [21]:
# Spot check: English-only input; the recorded output contains Chinese phrases.
yd.getSynonyms('PID')

{'pid', '新闻处', '比例 积分 微分', '比例-积分-微分', '管路及仪表布置图'}

In [22]:
# SQL template for inserting synonym pairs in bulk: insertSyns fills the single
# %s with comma-separated "(idChinese, idEnglish)" tuples.
sInsert = '''
insert into `keywords`.`synonym` (`idChinese`, `idEnglish`)
values %s;
'''

In [23]:
def insertWord (word):
    """Add ``word`` as a new row of `keywords`.`keywords` and commit.

    The id column is filled on the server by hand_with_beihang.gen_ticket64(0).

    NOTE(review): the value is quoted with repr(), i.e. as a Python literal
    rather than through the driver's escaping. hasWord and getWordId quote the
    same way, so all three have to change together if this is revisited.
    """
    literal = repr(word)
    statement = f'''
        insert into `keywords`.`keywords` (`id`, `keywords`)
        values
        (hand_with_beihang.gen_ticket64(0),
        {literal});
        '''
    sql(statement)
    db.commit()

In [24]:
def hasWord (word) -> bool:
    """Report whether ``word`` already has at least one row in ``synonym``.

    The keyword is matched by text in ``keywords`` and joined to ``synonym``
    on ``synonym.idChinese``; the answer is True when that join returns rows.

    NOTE(review): the value is quoted with repr(), matching insertWord and
    getWordId — keep the three consistent.
    """
    literal = repr(word)
    rows = sql(f'''
        select idEnglish from
        (
            select id from keywords where keywords = {literal}
        ) as a
        join synonym on a.id = synonym.idChinese;
    ''')
    # len() rather than truthiness: the result object may not support bool().
    return len(rows) > 0

In [25]:
def getWordId (word) -> int:
    """Return the id of ``word`` in `keywords`.`keywords`, inserting it first if absent.

    The id is read back with ``.values.item()`` on the query result.

    NOTE(review): the value is quoted with repr(), matching insertWord and
    hasWord — keep the three consistent.
    """
    table = '`keywords`.`keywords`'
    condition = f'`keywords`={repr(word)}'
    found = select('id', table, where=condition)
    if len(found) == 0:
        # Unknown keyword: create the row, then read its generated id back.
        insertWord(word)
        found = select('id', table, where=condition)
    return found.values.item()

In [26]:
lastTrans = 0  # time() recorded after the last successful Youdao lookup
transGap = 15  # minimum gap, in seconds, between two Youdao translate calls
iRetry = 0     # failed lookups counted since frmt last reset the counter
def getSyns (word, mode = ''):
    """Look up the synonyms of ``word`` and return them as keyword-id pairs.

    word -- keyword text; it is registered through getWordId before anything else
    mode -- passed straight to yd.getSynonyms; this function itself uses ''
            and 'ch2en'

    Returns a list of ``(id of word, id of synonym)`` tuples. The list is
    empty when ``word`` holds no Chinese characters, when no synonym comes
    back, or when the only synonym is ``word`` itself even in 'ch2en' mode.

    When the single synonym returned still contains Chinese, the lookup is
    repeated on that synonym and its pairs are re-keyed to ``word``; the
    synonym's own pairs are included as well if hasWord() reports no stored
    synonyms for it and it contains fewer than five spaces.

    Lookups are spaced at least ``transGap`` seconds apart. A failing lookup is
    retried indefinitely, each time after wait(60, ...) and a restart of ``yd``.

    NOTE(review): two Chinese phrases that each come back as the other's only
    synonym would make the recursion unbounded.
    """
    global lastTrans
    global iRetry

    wid = getWordId(word)
    # Always hand back a list: callers take len() of the result.
    if not hasChinese(word): return []
    while time() < lastTrans + transGap: sleep(0.25)

    while 1:
        try: syns = yd.getSynonyms(word, mode)
        # `Exception` (not a bare except) so KeyboardInterrupt and SystemExit
        # propagate instead of being retried.
        except Exception:
            if not iRetry: print()
            wait(60, f'({word})network error, waiting')
            iRetry += 1
            print(f'network error, retrying{iRetry}   ', end='\r')
            yd.close()
            yd.start()
        else: break
    lastTrans = time()
    # Log suspicious synonyms; they are reported only, not filtered out.
    for syn in syns:
        if hasChinese(syn):
            ourError(syns, f'{repr(word)} syn with chinese')
        if '.' in syn:
            ourError(syns, f'{repr(word)} syn with .')
        if '（' in syn or '）' in syn:
            ourError(syns, f'{repr(word)} syn with （ or ）')
    if len(syns) == 0: return []
    if len(syns) == 1: # traditional Chinese is first converted to simplified, then the synonym lookup runs again
        syn = [*syns][0]
        if syn == word:
            ourError(syns, f'{repr(word)} same as syns')
            if mode == 'ch2en': return []
            else: return getSyns(word, 'ch2en')
        if hasChinese(syn):
            rst = getSyns(syn)
            if not hasWord(syn) and syn.count(' ') < 5:
                return rst + [(wid, y) for x, y in rst]
            else:
                return [(wid, y) for x, y in rst]
    return [(wid, getWordId(syn)) for syn in syns]

In [27]:
def insertSyns (ids):
    """Insert synonym pairs with one statement and commit.

    ids -- iterable of pairs; each is rendered with str() and the results are
           comma-joined into the VALUES list of sInsert. An empty ``ids``
           yields a statement with an empty VALUES list, so callers should
           not pass one.
    """
    values = ','.join(map(str, ids))
    sql(sInsert % values)
    db.commit()

In [28]:
def frmt (s):
    """Print a timestamped status message that ends in a carriage return.

    If retries were counted since the previous status message, a newline is
    printed first. The retry counter ``iRetry`` is then reset to 0.
    """
    global iRetry
    if iRetry:
        print()
    stamp = f'({getNow()})'
    padding = ' ' * 20
    print(stamp, s, padding, end='\r')
    iRetry = 0

In [29]:
# Keywords that solve() refuses to process.
blacklist = {
    'na觙vecd4',
    '18 氟代脱氧葡萄糖正电子发射型断层扫描/计算机断层扫描',
    '白疕病',
    '性学觕述',
}
def banWord (s : str) -> bool:
    """Return True when ``s`` is one of the blacklisted keywords (exact match)."""
    return s in blacklist

In [30]:
def cleanKeywords (s):
    """Normalise a raw keyword before it is looked up or stored.

    Steps, in order: pass the text through rmUe, replace each of the CJK
    punctuation marks 。 ， 、 with a space, collapse every whitespace run to a
    single space, and strip leading/trailing whitespace.

    s -- raw keyword text
    Returns the cleaned string.
    """
    s = rmUe(s)
    # A character class is required: without the brackets the pattern would
    # match only the exact three-character sequence, never a lone mark.
    s = re.sub(r'[。，、]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

In [31]:
def solve (word):
    """Fetch the synonyms of one keyword and store them.

    The keyword is cleaned with cleanKeywords first. Nothing happens when it is
    blacklisted, when hasWord() reports stored synonyms for it, or when
    getSyns() yields nothing. Otherwise the pairs are inserted and the elapsed
    time is logged through ourLog.

    NOTE(review): ourLog prints the module-level ``word``, not the cleaned
    local value used here.
    """
    word = cleanKeywords(word)
    # Cheap in-memory check first, so blacklisted words cost no database query.
    if banWord(word): return
    if hasWord(word): return

    start = time()
    frmt(f'get and insert syns : {repr(word)}')

    ids = getSyns(word)
    # Covers an empty list as well as a None result, on which len() would raise.
    if not ids: return
    insertSyns(ids)
    ourLog(f'{time() - start:.3f}s', 'syn time')

In [32]:
# Driver: walk every selected keyword, then e-mail the total elapsed time.
start = time()
# The loop variable has to stay named `word`: ourError / ourLog read it as a global.
for word in tqdm(words):
    # Only keywords with an odd number of characters are handled here.
    # NOTE(review): presumably the even-length half is processed elsewhere (the
    # mail subject says "not server") — confirm.
    if len(word) & 1: solve (word)
sendEmail(f'cost time {convertSeconds(time() - start)}', 'get selected syn accomplished(not server)')

HBox(children=(FloatProgress(value=0.0, max=187172.0), HTML(value='')))

(2020-12-20 10:58:58) get and insert syns : 'boxcar滤波器'                                               


In [31]:
# Close the MySQL connection opened near the top of the notebook.
db.close()

In [32]:
# Close the synonym client.
yd.close()