## Base Functions

In [158]:
import pandas as pd
import jieba
import itertools
from tqdm.auto import tqdm
import json
from collections import defaultdict



In [159]:
# !pip install jieba
# !pip install emoji
import jieba
def segmentate_sentence(sent):
    """
    Tokenize a sentence with jieba (``cut_all=False``).
    ref : https://github.com/fxsjy/jieba
    Args :
        sent(str) : an unsegmented sentence, e.g "Iloveyou"
    Output :
        list[str] : the tokens in their original order, without the tokens
            that are exactly one space, e.g ["I", "love", "you"]
    """
    tokens = jieba.lcut(sent, cut_all=False)

    # Only tokens equal to a single space are dropped; everything else is kept.
    return list(filter(lambda token: token != " ", tokens))
    
def clean(x):
    """Reduce a text to its Chinese characters.

    Every character outside U+4E00-U+9FA5 (ASCII letters, digits,
    punctuation, whitespace, Hangul, emoji, ...) is removed.

    The earlier version ran a second, emoji-aware whitelist pass built from
    ``emoji.UNICODE_EMOJI``. That pass could never remove anything: its
    whitelist contained the range ㄱ-힣 (U+3131-U+D7A3), which encloses
    U+4E00-U+9FA5, i.e. every character that survives the first pass. It
    only added a dependency on an attribute that emoji >= 2.0 no longer
    provides, so it has been dropped; the result is unchanged.

    :param x: a text (str)
    :return: the text with everything but its Chinese characters removed (str)
    """
    import re

    non_chinese = re.compile(r"[^\u4e00-\u9fa5]+")
    # No strip() needed afterwards: whitespace is already removed by the pattern.
    return non_chinese.sub('', x)
    

## Jieba

In [181]:
# from xlsx to txt (jieba user dictionary)
def make_user_dict(xlsx_path, path) :
    """
    make a custom dictionary for jieba
    It reads the 'words' column of an Excel sheet and writes it to a txt
    file, one word per line, so the file can be handed to jieba.
    xlsx_path : path of the Excel dictionary (needs a 'words' column)
    path : path of the txt file to write (overwritten if it exists)
    """
    df = pd.read_excel(xlsx_path)
    # Skip empty cells and coerce to str so a blank or numeric cell cannot
    # break the join below.
    words = df['words'].dropna().astype(str)
    # jieba expects user dictionaries to be UTF-8, so do not rely on the
    # platform's default encoding.
    with open(path, "w", encoding="utf-8") as f :
        f.write('\n'.join(words))
    
def add_words(raw_paths, existing_data=None) :
    """
    Count the words of the given title files and merge them with existing data.

    Every title in the 'English title' column of train_data/<path>.csv is
    segmented; a word seen before gets its count increased, a new word is
    appended with count 1, a machine translation as meaning and an empty
    position.

    input :
        raw_paths : list of file names (without ".csv") in the train_data folder
        existing_data : dataframe with the columns 'words', 'cnt', 'meaning'
            and 'position'; if not given, the data set starts empty
    output :
        dataframe with the columns 'words', 'meaning', 'position', 'cnt',
        sorted by 'cnt' in descending order with a fresh index
    """
    bag_of_words = []
    cnt_words = []
    meanings = []
    positions = []

    if existing_data is not None :
        bag_of_words = list(existing_data.loc[:,'words'])
        cnt_words = list(existing_data.loc[:,'cnt'])
        meanings = list(existing_data.loc[:,'meaning'])
        positions = list(existing_data.loc[:,'position'])

    # word -> position in the parallel lists. Replaces the former
    # `word in list` / `list.index` scans (O(n) per token). setdefault keeps
    # the first occurrence, which is what list.index returned.
    word_index = {}
    for idx, known in enumerate(bag_of_words) :
        word_index.setdefault(known, idx)

    for path in raw_paths :
        df = pd.read_csv(f"train_data/{path}.csv")

        for title in tqdm(df['English title']) :
            for word in segmentate_sentence(title) :
                idx = word_index.get(word)
                if idx is not None :
                    cnt_words[idx] += 1
                    continue

                word_index[word] = len(bag_of_words)
                bag_of_words.append(word)
                cnt_words.append(1)
                # Only unseen words are sent to the translation service.
                res = translate(word)
                meanings.append(json.loads(res)['message']['result']['translatedText'])
                positions.append("")

    df = pd.DataFrame(list(zip(bag_of_words, meanings, positions, cnt_words)),columns =['words', 'meaning', 'position','cnt'])
    return df.sort_values(by='cnt', ascending=False).reset_index(drop=True)

def translate(target) :
    """
    Translate Chinese (zh-CN) text to Korean (ko) through the Papago n2mt endpoint.

    input :
        target : text to translate (str)
    output :
        the response body decoded as UTF-8 (str); callers in this file parse
        it as JSON and read ['message']['result']['translatedText']
    raises :
        RuntimeError : the server answered with a status other than 200
        (urllib itself raises urllib.error.HTTPError / URLError for error
        responses and connection failures)
    """
    import os
    import urllib.parse
    import urllib.request

    # NOTE(review): credentials should not live in source control. They can
    # now be supplied through the environment; the literals below are kept
    # only as a fallback so existing runs keep working, and should be rotated
    # and removed.
    # Client ID value issued by the developer center
    client_id = os.environ.get("NAVER_CLIENT_ID", "sy3GuTVVRhzvp2xC8vVM")
    client_secret = os.environ.get("NAVER_CLIENT_SECRET", "D1CqkDNhAq")

    encText = urllib.parse.quote(target)
    data = "source=zh-CN&target=ko&text=" + encText
    url = "https://openapi.naver.com/v1/papago/n2mt"
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id",client_id)
    request.add_header("X-Naver-Client-Secret",client_secret)

    # The context manager closes the HTTP response even when reading fails.
    with urllib.request.urlopen(request, data=data.encode("utf-8")) as response :
        rescode = response.getcode()
        if rescode != 200 :
            # The old code built this message with str + int, which itself
            # raised a TypeError instead of reporting the status.
            raise RuntimeError(f"Error Code:{rescode}")
        return response.read().decode('utf-8')


# import json
# res = translate("娃娃领")
# json.loads(res)['message']['result']['translatedText']


In [4]:
# !pip uninstall googletrans==4.0.0-rc1

###Make it as word and save on table###
###table_col : mymeaning, MT_meaning, count, position

######### TRAIN SECTION ##########
# Regenerate the jieba user dictionary from the reference workbook, then load it.
make_user_dict(
    "./train_data/reference.xlsx",
    "./train_data/userdict.txt",
)
jieba.load_userdict("./train_data/userdict.txt")

# Keep the reference table in memory, indexed by the 'words' column.
reference = pd.read_excel("./train_data/reference.xlsx").set_index('words', drop=True)
######### TRAIN SECTION ##########

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/jt/jg8f8t5d5l94gkq96yn3k41h0000gn/T/jieba.cache
Loading model cost 0.927 seconds.
Prefix dict has been built successfully.


In [216]:
## Retrain
## Retrain model with ./working/new_data.txt

def retrain(txt_path, ref_path) :
    """
    update the dictionary with txt file.

    Only lines that start with '₩' are applied. Such a line has the form
    "₩<word>, <meaning>"; it is split at the FIRST comma, so the meaning may
    itself contain commas. Marked lines without a comma or without a word are
    ignored. If a word is marked more than once, the last meaning wins.

    An existing word gets its 'meaning' replaced when it differs; an unknown
    word is appended as a new row. The workbook at ref_path is rewritten.

    input :
        txt_path : updated contents
        ref_path : reference dictionary's path
    """
    reference = pd.read_excel(ref_path)
    reference = reference.set_index('words', drop=True)

    updates = {}
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in f :
            line = line.strip()
            # Separator lines ('---...'), 'result...' lines and unmarked
            # entries all fail this test and are skipped.
            if not line.startswith('₩') :
                continue
            key, sep, meaning = line[1:].partition(',')
            key = key.strip()
            if not sep or not key :
                continue
            updates[key] = meaning.strip()

    changed = []
    for key, meaning in updates.items() :
        print(key)
        if key in reference.index :
            if reference.loc[key, 'meaning'] != meaning :
                reference.loc[key, 'meaning'] = meaning
                changed.append(key)
        else :
            # assumes the sheet's remaining columns are exactly
            # (meaning, position, cnt) in this order — TODO confirm
            reference.loc[key] = [meaning, 0, 1]
            changed.append(key)

    for key in changed :
        print(f"{key} -> {reference.loc[key, 'meaning']}")

    print(f"data has been updated {len(changed)} times")

    reference.to_excel(ref_path)

# Apply the '₩'-marked lines of new_data.txt to the reference workbook (rewritten in place).
retrain("./working/new_data.txt", "./train_data/reference.xlsx")

曾小咸
黑标
钢圈
乞丐
木耳边
你好
卡农
落落狷介
觅定
阔腿
直筒裤
波点
牛角扣
小个子
蓝
纯欲
灯笼袖
单
排扣
超仙
气质
通勤
特大码
透视
雪中飞
data has been updated 0 times


In [238]:
##Start Working##
### Why can't 蓝语 be found???? Check -> it is in reference but is not found, so it seems to be looked up anew.
### NOTE(review): dropna(axis=0) below removes every row that has a missing value in ANY column,
### not only in 'meaning'; such words then miss the lookup and are re-translated — confirm whether this is the cause.
df = pd.read_csv("working/work.csv")
#########LOAD TRAIN SECTION##########
make_user_dict("./train_data/reference.xlsx","./train_data/userdict.txt")
jieba.load_userdict("./train_data/userdict.txt") # load customized dictionary
reference= pd.read_excel("./train_data/reference.xlsx")
reference = reference.set_index('words', drop=True)
reference = reference.dropna(axis=0)
#########LOAD TRAIN SECTION##########
# auto.txt     : per title, the segmented words and their joined translations
# new_data.txt : per title, "word, meaning" pairs under a numbered separator
# new_words.txt: every word that had to be machine-translated
# The with-statement closes all three files, so no explicit close() is needed.
with open("./working/auto.txt", "w", encoding="utf-8") as h, \
        open("./working/new_data.txt", "w", encoding="utf-8") as g, \
        open("./new_words.txt", "w", encoding="utf-8") as f :
    for row, title in enumerate(tqdm(df['English title'])) :
        # Segment once and reuse the result for both the echo and the lookup.
        sentence = segmentate_sentence(title)
        translated = []
        buff = {}
        g.write(f"\n-------------------{row+1}------------\n")
        h.write(f"{row+1}.\n")
        h.write(f"{' '.join(sentence)}\n")
        for word in sentence :
            if word in reference.index :
                buff[word] = reference.loc[word, 'meaning']
                translated.append(str(reference.loc[word, 'meaning']))
            else :
                # Unknown word: fall back to machine translation and log it.
                res = translate(word)
                res = json.loads(res)['message']['result']['translatedText']
                buff[word] = res
                translated.append(res)
                f.write(f"{word}, {res}\n")
        h.write(f"{' '.join(translated)}\n")
        for key, item in buff.items() :
            g.write(f"{key}, {item}\n")


100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


In [237]:
# Write the in-memory reference table back to the Excel dictionary.
# NOTE(review): in this file's order `reference` was last assigned by the
# dropna(axis=0) call in the working cell, so rows dropped there would be
# removed from the workbook as well — confirm that is intended.
reference.to_excel("./train_data/reference.xlsx")

## Generating - word

One-hot encoding that includes position information.

meaning, category, position info

1. give main meaning
2. add meaning
3. add position meaning

words - word - meaning - cate - 
1
2
3
4

In [None]:
###Open other files and save them to this table too###
###No overlap