### 库

In [1]:
import json, re
from time import time, sleep
from datetime import datetime

In [2]:
import pandas as pd
import numpy as np
import os, sys

In [3]:
#from tqdm.notebook import tqdm, trange, tnrange
from vUtil.vTqdm import tqdm, trange
from vVariableInspector import _getshapeof
from vUtil.vFile import fprint, readlines, linesReader
from vUtil.vEmail import sendEmail
from vUtil.vTime import getNow, convertSeconds, getToday
from vUtil.vUe import rmUe
from vUtil.debug import debug

In [4]:
from vMysql import MysqlProxy
from youdaoSynonym import YoudaoSynonym

In [5]:
from vToEnglish.detect import hasChinese, countChinese
# from vToEnglish.translate import translate
from vToEnglish.toEnglish import wait, toEnglish

In [6]:
from simpleAndTranditional import trd2smp
from keywords.selected import kwSelected as words

In [7]:
from vUtil.vLog import frmt, print, _print

### utils

In [8]:
def ourError (error, errorType = 'invalid keyword'):
    """Write one ``[ERROR]`` line through ``fprint``.

    The line carries the current time (``getNow()``), the repr of the
    module-level ``word`` (read from global scope at call time, not a
    parameter), ``errorType`` and ``error``.  It is sent to the file
    ``error{getToday()}.txt`` with ``path='error'``.

    Args:
        error: Payload placed between double quotes at the end of the line.
        errorType: Short category label placed before the payload.
    """
    message = f'[ERROR] ({getNow()}) {repr(word)} : {errorType}: "{error}"\n'
    target = f'error{getToday()}.txt'
    fprint(message, file=target, path='error')

In [9]:
def ourLog (log, logType = ''):
    """Write one ``[LOG]`` line through ``fprint``.

    The line carries the current time (``getNow()``), the repr of the
    module-level ``word`` (read from global scope at call time, not a
    parameter), ``logType`` and ``log``.  It is sent to the file
    ``log{getToday()}.txt`` with ``path='log'``.

    Args:
        log: Payload placed between double quotes at the end of the line.
        logType: Short category label placed before the payload.
    """
    message = f'[LOG] ({getNow()}) {repr(word)} : {logType} : "{log}"\n'
    target = f'log{getToday()}.txt'
    fprint(message, file=target, path='log')

In [9]:
def frmt (*args, tqdm=None, end='\r'):
    """Print a timestamped status line that the next status line overwrites.

    This definition shadows the ``frmt`` imported from ``vUtil.vLog``.  The
    call sites in this notebook pass ``tqdm=...`` and ``end=...`` keywords,
    so both are accepted here; without them every such call raised
    ``TypeError``.

    Args:
        *args: Values printed after the ``(timestamp)`` prefix.
        tqdm: Progress bar handed over by the caller.  It is accepted for
            call-site compatibility and is not used by this implementation.
        end: Line terminator passed to ``print``; the default ``'\\r'``
            keeps the cursor on the same line.

    Side effects:
        Sets the module-level ``iRetry`` to 0.
        NOTE(review): ``getSyns`` uses the same ``iRetry`` as its retry
        counter and calls ``frmt`` while retrying, so the counter is reset
        on every retry message - confirm this is intended.
    """
    global iRetry
    # A previous retry message is still on the current line: move to a fresh
    # line first so it is not overwritten.  A missing counter counts as 0.
    if globals().get('iRetry'): print()
    # The 20 trailing spaces blank out leftovers of a longer previous line.
    print (f'({getNow()})', *args, ' ' * 20, end=end)
    iRetry = 0

In [10]:
def cleanKeywords (s):
    """Normalise a raw keyword string before it is looked up.

    Steps:
      1. Pass the text through ``rmUe`` (project helper; its exact effect is
         not visible here).
      2. Replace each of the Chinese punctuation marks ``。``, ``，`` and
         ``、`` with a space.  The earlier pattern ``。，、`` only matched
         the three marks when they appeared together in exactly that order,
         so single marks were never removed.
      3. Collapse every run of whitespace into one space and trim both ends.

    Args:
        s: Raw keyword text.

    Returns:
        The cleaned keyword string.
    """
    s = rmUe(s)
    # Character class: any ONE of the three marks, not the literal sequence.
    s = re.sub(r'[。，、]', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

### 获取同义词表

In [11]:
# SQL template used by insertSyns: `%s` is replaced with a comma-separated
# list of "(idChinese, idEnglish)" tuples before the statement is executed.
sInsert = '''
replace into `keywords`.`synonym` (`idChinese`, `idEnglish`)
values %s;
'''

In [12]:
def insertSyns (ids):
    """Write a batch of synonym id pairs with one statement built from ``sInsert``.

    Args:
        ids: Iterable of ``(idChinese, idEnglish)`` tuples.  Each tuple is
            rendered with ``str()`` and the results are joined with commas.

    Returns:
        None.  When ``ids`` is empty nothing is sent to the database; the
        template would otherwise expand to the malformed ``values ;``.
    """
    rows = [str(pair) for pair in ids]
    if not rows:
        return
    db.sql(sInsert % ','.join(rows), True)

In [13]:
def insertWord (word):
    """Insert ``word`` as a new row of ``keywords.keywords``.

    The row id is produced in SQL by ``landinn.gen_ticket64(1)``.

    NOTE(review): the value is quoted with Python ``repr()``, which is not
    SQL escaping - confirm the inputs never contain characters for which the
    two differ.

    Args:
        word: Keyword text to store.
    """
    statement = f'''
        insert into `keywords`.`keywords` (`id`, `keywords`)
        values
        (landinn.gen_ticket64(1),
        {repr(word)});
        '''
    db.sql(statement, True)

In [14]:
def hasWord (word) -> bool:
    """Tell whether ``word`` already has at least one synonym row.

    The query joins the keyword's id in ``keywords.keywords`` against
    ``keywords.synonym.idChinese``.

    Args:
        word: Keyword text; it is embedded in the SQL via ``repr()``.

    Returns:
        True when the join yields one or more rows, False otherwise.
    """
    query = f'''
        select idEnglish from
        (
            select id from keywords.keywords where keywords = {repr(word)}
        ) as a
        join keywords.synonym as b on a.id = b.idChinese;
    '''
    matches = db.sql(query, True)
    return len(matches) > 0

In [15]:
def getWordId (word) -> int:
    """Return the id of ``word`` in ``keywords.keywords``, creating the row if needed.

    The table is queried once; when nothing comes back the keyword is
    inserted with ``insertWord`` and the same query is issued again.

    Args:
        word: Keyword text; it is embedded in the ``where`` clause via ``repr()``.

    Returns:
        The single value held by the query result (``result.values.item()``).
    """
    table = '`keywords`.`keywords`'
    condition = f'`keywords`={repr(word)}'
    found = db.select('id', table, where=condition)
    if len(found) == 0:
        insertWord(word)
        found = db.select('id', table, where=condition)
    return found.values.item()

In [16]:
def insertSyn (ch, en):
    """Link one keyword to one synonym and store the pair.

    Both keywords are resolved to ids with ``getWordId`` (``ch`` first, then
    ``en``); the resulting pair is written with ``insertSyns``.

    Args:
        ch: Keyword whose id goes into the first slot of the pair.
        en: Keyword whose id goes into the second slot of the pair.

    Returns:
        A one-element list holding the ``(id_of_ch, id_of_en)`` tuple.
    """
    chId = getWordId(ch)
    enId = getWordId(en)
    pairs = [(chId, enId)]
    insertSyns(pairs)
    return pairs

In [21]:
# Manual check of insertSyn; the recorded output is the id pair it returned.
# NOTE(review): this writes a row to the synonym table every time the cell runs.
insertSyn('大数据','大数数据')

[(1032952127932334084, 1129305859648978945)]

In [24]:
# Second manual check of insertSyn with an unrelated pair of keywords.
# NOTE(review): this writes a row to the synonym table every time the cell runs.
insertSyn('曲志','我爱你')

[(1128905306662043649, 1128905427499941889)]

In [2]:
import pymysql

In [3]:
# Show the installed PyMySQL version.
pymysql.__version__

'0.10.1'

In [16]:
lastTrans = 0  # time() value stored after the most recent finished lookup; 0 until the first one
transGap = 15  # minimum interval, in seconds, between two calls to Youdao translation
iRetry = 0  # failed lookup attempts counted since the counter was last reset
def getSyns (word, en=None, mode = ''):
    """Look up the synonyms of ``word`` through ``yd`` and return keyword-id pairs.

    The id of ``word`` is resolved first with ``getWordId`` (which creates
    the keyword row when it is missing).  Lookups are throttled so that at
    least ``transGap`` seconds separate two of them, and a failing lookup is
    retried indefinitely after restarting the ``yd`` client.

    Args:
        word: Keyword to look up.
        en: Optional extra synonym added to the looked-up set.
        mode: Passed through to ``yd.getSynonyms``.  When the only result is
            the keyword itself, the lookup is repeated once with ``'ch2en'``.

    Returns:
        A list of ``(id_of_word, id_of_synonym)`` tuples.  The list is empty
        when ``word`` contains no Chinese characters or nothing usable is
        found.

    Raises:
        KeyboardInterrupt, SystemExit: propagated to the caller.  Only
            ``Exception`` subclasses are treated as a retryable failure; the
            earlier bare ``except:`` also swallowed ``SystemExit`` and kept
            retrying.
    """
    global lastTrans
    global iRetry

    wid = getWordId(word)
    if not hasChinese(word): return []
    # Throttle: poll until transGap seconds have passed since the last lookup.
    while time() < lastTrans + transGap: sleep(0.25)

    while 1:
        try: syns = yd.getSynonyms(word, mode)
        except Exception:
            # Any ordinary failure is reported as a network error and retried.
            wait(60, f'({repr(word)})network error, waiting')
            iRetry += 1
            frmt(f'({repr(word)})network error, retrying{iRetry}   ', end='\r')
            # Restart the client before the next attempt.
            yd.close()
            yd.start()
        else:
            if iRetry:
                # Leave the retry message on its own line and record how many
                # attempts were needed.
                print()
                ourError(f'retrying{iRetry}', 'network error')
                iRetry = 0
            break
    lastTrans = time()
    # Suspicious results are only logged here; they are still processed below.
    for syn in syns:
        if hasChinese(syn):
            ourError(syns, f'{repr(word)} syn with chinese')
        if '.' in syn:
            ourError(syns, f'{repr(word)} syn with .')
        if '（' in syn or '）' in syn:
            ourError(syns, f'{repr(word)} syn with （ or ）')
    if len(syns) == 0: return []
    if len(syns) == 1:
        # Traditional Chinese is converted to simplified first, then the
        # synonym lookup is run again on the converted form.
        syn = next(iter(syns))
        if syn == word:
            ourError(syns, f'{repr(word)} same as syns')
            if mode == 'ch2en': return []
            else: return getSyns(word, en, 'ch2en')
        if hasChinese(syn):
            rst = getSyns(syn, en)
            if not hasWord(syn) and syn.count(' ') < 5:
                # Keep the pairs of the converted form and mirror them onto
                # the original keyword.
                return rst + [(wid, y) for x, y in rst]
            else:
                return [(wid, y) for x, y in rst]

    syns = syns | (set() if en is None else {en})
    syns = yd._YoudaoSynonym__filter(syns)
    return [(wid, getWordId(syn)) for syn in syns]

In [17]:
# Keywords that are always skipped (presumably garbled or otherwise unusable
# entries - confirm with whoever collected them).
blacklist = {'na觙vecd4', '18 氟代脱氧葡萄糖正电子发射型断层扫描/计算机断层扫描', '白疕病',
             '性学觕述',  '目\uf8db\uf8f5', '心中\uf8e8\uf8f5\uf8e8\uf8f5大动',}
def banWord (s : str) -> bool:
    """Tell whether keyword ``s`` must be skipped.

    Args:
        s: Keyword text.

    Returns:
        True when ``s`` contains an ASCII or full-width question mark, or is
        listed in ``blacklist``; False otherwise.  The question-mark branch
        used to return the int ``1``, which contradicted the ``bool``
        annotation.
    """
    if '?' in s or '？' in s: return True
    return s in blacklist

In [18]:
def solve (word, en=None, tq=None):
    """Fetch the synonyms of one keyword and store them.

    The keyword is cleaned with ``cleanKeywords`` and skipped when
    ``banWord`` rejects it.  Two status lines are emitted through ``frmt``,
    the id pairs are obtained from ``getSyns`` and, when there are any,
    written with ``insertSyns``; the elapsed time is then logged with
    ``ourLog``.

    Note that ``ourLog`` reports the module-level ``word``, not the cleaned
    local value used here.

    Args:
        word: Raw keyword text.
        en: Optional known translation forwarded to ``getSyns``.
        tq: Progress bar forwarded to ``frmt`` as ``tqdm=``.

    Returns:
        None.
    """
    word = cleanKeywords(word)
    if banWord(word):
        return

    frmt('get and insert syns', tqdm=tq, end='\r')
    startedAt = time()
    frmt(f'get and insert syns : {repr(word)}', tqdm=tq, end='\r')

    pairs = getSyns(word, en)
    if len(pairs) > 0:
        insertSyns(pairs)
        ourLog(f'{time() - startedAt:.3f}s', 'syn time')

### 获取词表

In [19]:
# Schema name handed to the MySQL connection as its `db` argument.
schema = 'field_term'

In [20]:
# Connection object behind every `db.*` call in this notebook.
# NOTE(review): host, user and password are hard-coded - load them from
# environment variables or a secrets store before sharing this notebook.
db = MysqlProxy(ip='10.208.63.47',user='root',password='linlei',db=schema)

In [14]:
# List the tables reachable through the current connection.
db.sql('show tables;')

Unnamed: 0,Tables_in_field_term
0,field
1,term


In [22]:
# `name` column of field_term.field; iterated later as the field keywords.
fields = db.select('name', 'field_term.field')['name']

In [23]:
def getTerms (start, size):
    """Fetch one page of ``(name_cn, name_en)`` rows from ``field_term.term``.

    Args:
        start: Value passed to ``db.select`` as its third positional argument.
        size: Page size; ``start + size`` is passed as the fourth argument.
            NOTE(review): how ``db.select`` interprets these two numbers is
            not visible here - confirm against ``MysqlProxy``.

    Returns:
        The ``.values`` of the query result.
    """
    stop = start + size
    page = db.select('name_cn, name_en', 'field_term.term', start, stop)
    return page.values

In [24]:
# Youdao synonym client used by getSyns and by the probe cells below.
yd = YoudaoSynonym()

In [12]:
# sql('flush tables;')

### 测试youdao

In [52]:
# Restart the Youdao client: close it, then start it again.
yd.close()
yd.start()

In [39]:
# Probe: keyword containing the unusual character '啨'; one of the recorded
# results still carries that character.
yd.getSynonyms('kagomé啨晶格')

{'kagome memin crystalline lattice', 'kagome 啨 lattice'}

In [25]:
# Probe: plain Chinese technical term.
yd.getSynonyms('原始方程模式')

{'original equation model', 'primitive equation model'}

In [25]:
# Probe: Chinese term whose recorded results include an abbreviation ('cl data').
yd.getSynonyms('刀位数据')

{'cl data', 'cutter location data', 'tool position data'}

In [30]:
# Probe: long chemical name mixing digits, hyphens and spaces.
yd.getSynonyms('2-偕二硝甲基-5-硝基四唑4-氨基-1 2 4-三唑盐')

{'2 - bisphosphonate methyl nitrate - 5 - nitro tetrazolium 4 - amino - 1 2 4 - triazole salt',
 '2 bisphosphonate methyl nitrate 5 nitro tetrazolium 4 amino 1 2 4 triazole salt',
 '2 diazole 5 nitrotetrazole 4 amino 1 2 4 triazole salt',
 '2- diazole-5-nitrotetrazole-4-amino-1 2 4-triazole salt'}

In [32]:
# Probe: same call with the mode argument set explicitly to 'ch2en'.
yd.getSynonyms('气体绝缘金属封闭输电线路', 'ch2en')

{'gas insulated metal closed transmission line',
 'gas insulated metal enclosed transmission lines',
 'gas insulated metal-enclosed transmission lines'}

In [33]:
# Probe: Latin-only input; the recorded results are mostly Chinese entries.
yd.getSynonyms('PID')

{'pid', '新闻处', '比例 积分 微分', '比例-积分-微分', '管路及仪表布置图'}

### 同义词fields

In [36]:
# Fetch and store synonyms for every field name, then e-mail the elapsed time.
# NOTE(review): `ncols` is not defined anywhere in the visible notebook -
# confirm it is set elsewhere, otherwise this cell fails with NameError on a
# fresh kernel.
start = time()
tq = tqdm(fields, ncols=ncols)
# The loop variable has to stay named `word`: ourError and ourLog read the
# module-level `word` when they build their messages.
for word in tq:
    tq.set_description(f'{getNow()}')
    #if len(word) & 1: continue
    solve (word, tq=tq)
sendEmail(f'cost time {convertSeconds(time() - start)}', 'get fields syn accomplished(not server)')

HBox(children=(FloatProgress(value=0.0, max=221.0), HTML(value='')))

(2021-05-20 16:12:56) get and insert syns : '语言学'                                 


### 同义词terms

In [26]:
# Paging state for the term run below.
now = 0  # offset at which the run starts
bulk = 20  # number of terms requested per page
# Presumably the total number of rows in field_term.term - confirm what
# db.count returns.
nTerms = db.count('field_term.term').values.item()

In [27]:
# Manual override: start the term run at this offset instead of 0.
now = 219920

In [28]:
# Manual override of the upper bound used by the term run.
nTerms = 229840

In [31]:
# Print the current paging state.
# NOTE(review): the recorded output (now = 229840, nTerms = 230000) does not
# match the values assigned in the visible cells, so those cells were edited
# or re-run after this output was produced.
debug(now, bulk, nTerms)

<ipython-input-31-c81a87d369c1> line 1 : now = 229840, bulk = 20, nTerms = 230000


In [32]:
# Fetch and store synonyms for the terms page by page, then e-mail the
# elapsed time.
start = time()

# Outer loop: one iteration per page offset, from `now` up to `nTerms` in
# steps of `bulk`.
with trange(now, nTerms, bulk) as tr:
    for i in tr:
        frmt(f':{i}', tqdm=tr)
        # Keep the module-level `now` at the offset being processed so the
        # run can be resumed from it later.
        now = i
        # Progress log: emit the header line only while log/logTerms.csv
        # does not exist yet, then one "time,offset" line per page.
        if not os.path.exists('log/logTerms.csv'):
            fprint(f'time,now', file='logTerms.csv', path='log')
        fprint(f'{getNow()},{now}', file='logTerms.csv', path='log')
        # Inner loop: one (Chinese name, English name) pair per term.  The
        # variable has to stay named `word` because ourError and ourLog read
        # the module-level `word`.
        with tqdm(getTerms(now, bulk), leave=False) as tq:
            for word, en in tq:
                frmt(tqdm=tq)
            #     if len(word) & 1: continue
                solve (word, en, tq=tq)
sendEmail(f'cost time {convertSeconds(time() - start)}', 'get terms syn accomplished(not server)')

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=8.0), HTML(value='')), layout=Layout(disp…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=20.0), HTML(value='')), layout=Layout(dis…




In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably ids of terms whose lookup failed - confirm before relying on it.
errorTermId = [876, 1048, 1471, 1956, 2558, 2993, 3529, 3820, 4703, 4740, 4825, 4860, 4861, 4902, 4903, 5208, 5393, 5402, 5407, 5775, 6446, 7532, 8002, 8321, 8684, 8777, 8877, 8893, 8902, 8932, 8980, 9021, 9022, 9023, 9024, 9118, 9137, 9160, 9163, 9164, 9280, 9307, 9403, 9931, 262357, 266738, 278340, 278341, 278568]

### 关闭

In [35]:
# Close the MySQL connection opened earlier in the notebook.
db.close()

In [36]:
# Shut down the Youdao synonym client.
yd.close()