## Base Funcs

In [1]:
import pandas as pd
import jieba
import itertools
from tqdm.auto import tqdm
import json
from collections import defaultdict



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install jieba
# !pip install emoji
import jieba
def segmentate_sentence(sent):
    """
    Tokenize a sentence with jieba.
    ref : https://github.com/fxsjy/jieba
    Args :
        sent(str) : a sentence without segmentation, e.g "Iloveyou"
    Output :
        list[str] : the tokens returned by ``jieba.lcut(sent, cut_all=False)``
                    in their original order, minus every token that is exactly
                    one space, e.g ["I", "love", "you"]
    """
    tokens = jieba.lcut(sent, cut_all=False)
    # Only the single-space token is dropped; any other token is kept as is.
    return list(filter(lambda token: token != " ", tokens))
    
def clean(x):
    """Reduce a text to its Chinese characters.

    Every run of characters outside the range U+4E00-U+9FA5 (letters, digits,
    punctuation, whitespace, Hangul, emoji, ...) is removed.

    :param x: the raw text (str)
    :return: the text with only its Chinese characters left (str)
    """
    import re

    # The earlier implementation additionally applied a whitelist built from
    # ``emoji.UNICODE_EMOJI``. That attribute no longer exists in emoji >= 2.0
    # and was interpolated unescaped into a character class. The whitelist pass
    # could never remove anything: it ran after this filter and its allowed
    # range ㄱ-힣 (U+3131-U+D7A3) contains the whole Chinese range kept here.
    # The trailing strip() was a no-op for the same reason. Dropping both keeps
    # the result identical while removing the third-party dependency.
    non_chinese = re.compile(r"[^\u4e00-\u9fa5]+")
    return non_chinese.sub('', x)
    

## Jieba

In [82]:
# from xlsx to txt (the custom dictionary file that jieba loads)
def make_user_dict(xlsx_path, path) :
    """
    Make a custom dictionary file for jieba.

    Reads the 'words' column of an Excel workbook and writes it to a text
    file, one word per line (no trailing newline).

    xlsx_path : path of the Excel workbook; it must have a 'words' column
    path : path of the text file to write (overwritten if it exists)
    """
    df = pd.read_excel(xlsx_path)
    # Empty cells come back as NaN and would break str.join; skip them and
    # make sure everything else (e.g. purely numeric words) is a string.
    words = df.loc[:, 'words'].dropna().astype(str)
    # Write UTF-8 explicitly so the Chinese text does not depend on the
    # platform's default encoding. The with-block closes the file.
    with open(path, "w", encoding="utf-8") as f :
        f.write('\n'.join(words))
    
def add_words(raw_paths, existing_data=None) :
    """
    Count the words of the given title files and merge them with existing data.

    Every title in the 'English title' column of each CSV is segmented with
    segmentate_sentence. A word that is already known gets its count
    increased; a new word is translated with translate() and appended with a
    count of 1 and an empty position.

    input :
        raw_paths : list of file names (without '.csv') inside the train_data folder
        existing_data : dataframe with the columns 'words', 'cnt', 'meaning'
                        and 'position'; if not given, the table starts empty
    output :
        dataframe with the columns 'words', 'meaning', 'position', 'cnt',
        sorted by 'cnt' in descending order with a fresh 0..n-1 index
    """
    bag_of_words = []
    cnt_words = []
    meanings = []
    positions = []

    if existing_data is not None :
        bag_of_words = list(existing_data.loc[:,'words'])
        cnt_words = list(existing_data.loc[:,'cnt'])
        meanings = list(existing_data.loc[:,'meaning'])
        positions = list(existing_data.loc[:,'position'])

    # word -> row number. Replaces the per-token list scan (`in` + `.index`),
    # which made the loop quadratic in the vocabulary size. setdefault keeps
    # the first occurrence of a duplicated word, exactly like list.index did.
    row_of = {}
    for row_number, known_word in enumerate(bag_of_words) :
        row_of.setdefault(known_word, row_number)

    for path in raw_paths :
        df = pd.read_csv(f"train_data/{path}.csv")

        for row in tqdm(range(df.shape[0])) :
            for word in segmentate_sentence(df['English title'][row]) :
                if word in row_of :
                    cnt_words[row_of[word]] += 1
                else :
                    # Translate before touching the lists so that a failed
                    # request cannot leave the four columns misaligned.
                    res = translate(word)
                    meaning = json.loads(res)['message']['result']['translatedText']
                    row_of[word] = len(bag_of_words)
                    bag_of_words.append(word)
                    cnt_words.append(1)
                    meanings.append(meaning)
                    positions.append("")

    df = pd.DataFrame(list(zip(bag_of_words, meanings, positions, cnt_words)),columns =['words', 'meaning', 'position','cnt'])
    return df.sort_values(by='cnt', ascending=False).reset_index(drop=True)

def translate(target) :
    """
    Translate a Chinese (zh-CN) text to Korean with the Naver Papago NMT API.

    The API credentials are read from the environment variables
    NAVER_CLIENT_ID and NAVER_CLIENT_SECRET.

    input :
        target : text to translate (str)
    output :
        the raw JSON response body (str) when the service answers with
        status 200, otherwise None (after printing the status code)
    raises :
        RuntimeError : when one of the credential variables is not set
    """
    import os
    import urllib.parse
    import urllib.request

    # Credentials must not live in the notebook. NOTE(review): the values that
    # used to be hard-coded here should be revoked in the developer center.
    client_id = os.environ.get("NAVER_CLIENT_ID")  # Client ID issued by the developer center
    client_secret = os.environ.get("NAVER_CLIENT_SECRET")
    if not client_id or not client_secret :
        raise RuntimeError(
            "Set the NAVER_CLIENT_ID and NAVER_CLIENT_SECRET environment "
            "variables before calling translate()")

    encText = urllib.parse.quote(target)
    data = "source=zh-CN&target=ko&text=" + encText
    url = "https://openapi.naver.com/v1/papago/n2mt"
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id",client_id)
    request.add_header("X-Naver-Client-Secret",client_secret)
    # The with-block releases the connection even if reading fails.
    with urllib.request.urlopen(request, data=data.encode("utf-8")) as response :
        rescode = response.getcode()
        if rescode == 200 :
            return response.read().decode('utf-8')
    # rescode is an int; it has to be converted before concatenation.
    print("Error Code:" + str(rescode))
    return None

def retrain(txt_path, ref_path) :
    """
    Update the reference dictionary from a reviewed text file.

    Only lines that end with '₩' are applied. After the marker is removed such
    a line must have the form ``word | meaning | position``. Lines starting
    with '---', lines of at most one character and unmarked lines are ignored;
    marked lines that do not have exactly three fields are reported and
    skipped.

    For a word already in the reference, 'meaning' and 'position' are
    overwritten when they differ. A new word is added with a count of 1
    unless its position is the text 'None'.

    input :
        txt_path : path of the text file with the updated contents
        ref_path : path of the reference workbook; it is read and then
                   overwritten with the updated table
    output :
        the updated reference dataframe, indexed by 'words'
    """
    print("#####Retrain Started#####")
    reference = pd.read_excel(ref_path)
    reference = reference.set_index('words', drop=True)
    cnt = 0
    # UTF-8 is given explicitly: the file holds Chinese and Korean text and
    # must not depend on the platform's default encoding.
    with open(txt_path, "r", encoding="utf-8") as f :
        lines = [line.strip() for line in f]

    # word -> (meaning, position); a later line for the same word wins.
    updates = {}
    for line in lines :
        if line.startswith('---') or len(line) <= 1 or not line.endswith('₩') :
            continue
        fields = [field.strip() for field in line[:-1].split('|')]
        if len(fields) != 3 :
            # Previously this aborted the whole run with an unpacking error.
            print(f"skipped malformed line : {line}")
            continue
        key, meaning, pos = fields
        updates[key] = (meaning, pos)

    for key, (meaning, pos) in updates.items() :
        if key in reference.index :
            if reference.loc[key, 'meaning'] != meaning :
                print(f"{key} : {reference.loc[key, 'meaning']}->", end="")
                reference.loc[key, 'meaning'] = meaning
                print(f"{reference.loc[key, 'meaning']}")
                cnt+=1
            if reference.loc[key, 'position'] != pos :
                print(f"{key} : {reference.loc[key, 'position']}->", end="")
                reference.loc[key, 'position'] = pos
                print(f"{reference.loc[key, 'position']}")
                cnt+=1

        else :
            print(key)
            if pos != 'None' :
                # Assumes the reference has exactly the three columns
                # meaning, position and count, in that order.
                reference.loc[key] = [meaning, pos, 1]
                print(f"{key} : {list(reference.loc[key, :])} ")
                cnt+=1


    print(f"data has been updated {cnt} times")
    reference.to_excel(ref_path)
    print("#####Retrain Ended#####")

    return reference

# reference= pd.read_excel("./train_data/reference.xlsx")
# reference = reference.set_index('words', drop=True)
# reference = retrain("./working/new_data.txt", "./train_data/reference.xlsx")


#####Retrain Started#####
data has been updated 0 times
#####Retrain Ended#####


In [53]:
#######Add position of words which are in the given set#####
def get_key_words(txt_path) :
    """
    Read a keyword file and return its lines as a list.

    input :
        txt_path : path of a UTF-8 text file with one keyword per line
    output :
        list of the lines with surrounding whitespace removed
        (blank lines become empty strings)
    """
    # UTF-8 is given explicitly so that non-ASCII keywords are decoded the
    # same way on every platform.
    with open(txt_path, "r", encoding="utf-8") as f :
        return [line.strip() for line in f]

def give_position(set_of_words, position, reference) :
    """
    Set the position of every reference row whose meaning contains a keyword.

    A row matches when any single character of its 'meaning' (converted to
    text), or any two consecutive characters of it, is in set_of_words.
    Matching rows get 'position' overwritten and are printed.

    input :
        set_of_words : container of keywords, tested with ``in``
        position : value written to the 'position' column of matching rows
        reference : dataframe with 'meaning' and 'position' columns;
                    it is modified in place
    output :
        the same reference dataframe
    """
    for idx in reference.index :
        text = str(reference.loc[idx, 'meaning'])
        last = len(text) - 1
        # The two-character window is not tried at the last character.
        matched = any(
            text[i] in set_of_words
            or (i != last and text[i:i + 2] in set_of_words)
            for i in range(len(text))
        )
        if not matched :
            continue
        reference.loc[idx, 'position'] = position
        print(idx, reference.loc[idx, 'meaning'], "-->", position)

    return reference

txt_path = "./keywords/kinds.txt"
position = 1
# Load the reference table indexed by word, without rows that have any empty cell.
reference = (
    pd.read_excel("./train_data/reference.xlsx")
    .set_index('words', drop=True)
    .dropna(axis=0)
)
# Tag every row whose meaning contains one of the keywords, then save the result.
key_words_from_file = get_key_words(txt_path)
reference = give_position(key_words_from_file, position, reference)
reference.to_excel("./train_data/reference.xlsx")

# for idx in reference.index :
#     print(idx)
#     try :
#         if reference.loc[idx, 'position'] == 'None' :
#             reference = reference.drop([idx])
#     except :
#         reference = reference.drop([idx])

上衣 상의 --> 1
外套 재킷 --> 1
T恤 티셔츠 --> 1
裤 팬츠 --> 1
裤子 팬츠 --> 1
牛仔裤 청바지 --> 1
针织 니트 --> 1
长裤 긴 바지 --> 1
衫 셔츠 --> 1
衬衣 셔츠 --> 1
短裤 반바지 --> 1
针织衫 니트 --> 1
西装裤 정장 바지 --> 1
休闲裤 캐주얼 바지 --> 1
运动裤 트레이닝 바지 --> 1
风衣 트렌치코트 --> 1
卫裤 맨투맨 바지 --> 1
短外套 쇼트 재킷 --> 1
连帽 후드 --> 1
筒裤 일자 바지 --> 1
背带裤 멜빵 바지 --> 1
阔腿裤 와이드 팬츠 --> 1
体恤 티셔츠 --> 1
底裤 속바지 --> 1
喇叭裤 나팔 바지 --> 1
夹克 재킷 --> 1
呢子大衣 모직 코트 --> 1
工装裤 카고 바지 --> 1
西裤 양복 바지 --> 1
热裤 핫팬츠 --> 1
裤型 바지 모양 --> 1
裤加绒 바지 기모 --> 1
裤裙 치마바지 --> 1
微喇裤 부츠컷 팬츠 --> 1
帽卫衣 후드 맨투맨 --> 1
裤卫裤 팬츠 바지 --> 1
裤外 바지 밖 --> 1
五分裤 5부 팬츠 --> 1
白衬衫 화이트 셔츠 --> 1
裤百搭女 바지저고리녀 --> 1
牛仔衣 데님 셔츠 --> 1
灯笼裤 벌룬팬츠 --> 1
女上衣 여자 상의 --> 1
貂皮大衣 밍크 코트 --> 1
t恤 티셔츠 --> 1
上衣服 상의 --> 1
单色T恤 무지티셔츠 --> 1
开衫卫衣 후드집업 --> 1
羊绒外套 캐시미어 코트 --> 1
羊皮夹克 무스탕 재킷 --> 1
毛呢连衣裙 니트 원피스 --> 1
哈伦裤 배기 바지 --> 1
钩花 플라워 니트 디자인 --> 1
直筒裤 일자 바지 --> 1


In [105]:
# Load the reference table and drop every row whose 'position' cannot be
# converted to an integer; the index of each dropped row is printed.
reference= pd.read_excel("./train_data/reference.xlsx")
reference = reference.set_index('words', drop=True)
reference = reference.dropna(axis=0)
for idx in reference.index :
    try :
        int(reference.loc[idx, 'position'])
    # Only the failures int() itself can raise are treated as "not a position";
    # a bare except would also hide typos, KeyboardInterrupt and the like.
    except (ValueError, TypeError, OverflowError) :
        print(idx)
        reference = reference.drop([idx])


小


In [106]:
# Write the current `reference` table to ./train_data/reference.xlsx, overwriting the file.
reference.to_excel("./train_data/reference.xlsx")

In [108]:
##Start Working##
def forward(data_path, train_data, user_dict, start) :
    """
    Translate the titles of a work file word by word and write the drafts.

    Two files are written (both overwritten, UTF-8):
      ./working/auto.txt     : per row the row number, the segmented title and
                               the joined translation, followed by
                               "2022 [season] 신상" when a keyword was seen
      ./working/new_data.txt : per row a separator line and one
                               "word | meaning | position" line for every
                               distinct word of the title

    input :
        data_path : CSV file with an 'English title' column
        train_data : reference workbook, indexed by its 'words' column after loading
        user_dict : path where the jieba user dictionary is (re)written
        start : index of the first row to process
    """
    df = pd.read_csv(data_path)
    #########LOAD TRAINED SECTION##########
    make_user_dict(train_data, user_dict)
    jieba.load_userdict(user_dict) # load customized dictionary
    reference= pd.read_excel(train_data)
    reference = reference.set_index('words', drop=True)
    reference = reference.dropna(axis=0)
    #########LOAD TRAINED SECTION##########
    # Loop-invariant, so built once. Words removed from the translation; seeing
    # one of them triggers the "2022 ... 신상" suffix. The int 2022 is kept on
    # purpose: a meaning read from the workbook may be numeric.
    key_words= ['2022', '신상', '2022신상', '22', '20', '년', 2022] ## For Extra section
    # UTF-8 is given explicitly because both files hold Chinese and Korean
    # text; the with-block closes both files.
    with open("./working/auto.txt", "w", encoding="utf-8") as h, \
            open("./working/new_data.txt", "w", encoding="utf-8") as g :
        for row in tqdm(range(start, df.shape[0])) :
            ###################Generating Section###########################
            # Segment once and reuse the result for the loop and the output.
            sentence = segmentate_sentence(df['English title'][row])
            translated = []
            buff = {}
            season=""
            new_flag=False
            for word in sentence :
                if word in reference.index :
                    if int(reference.loc[word, 'position']) ==11 : ##Check Season
                        season = reference.loc[word,'meaning']
                    buff[word] = list(reference.loc[word, :])
                    translated.append(str(buff[word][0]))
                else :
                    res = translate(word)
                    res = json.loads(res)['message']['result']['translatedText']
                    buff[word] = [res,None,1]
                    translated.append(buff[word][0])
                ###################Extra Section###########################
                if not new_flag and buff[word][0] in key_words :
                    new_flag=True
            ###################Generating Section###########################

            translated = [word for word in translated if (word not in key_words)]
            ###################Extra Section###########################

            ###################Writing Section###########################
            h.write(f"{row+1}.\n")
            h.write(f"{' '.join(sentence)}\n")
            h.write(f"{' '.join(translated)}, ")
            if new_flag :
                if season == "" :
                    h.write(f"2022 신상\n")
                else :
                    h.write(f"2022 {season} 신상\n")
            else :
                h.write("\n")
            g.write(f"\n-------------------{row+1}------------\n")
            for key, item in buff.items() :
                g.write(f"{key}\t|\t{item[0]}\t|\t{item[1]}\n")
            ###################Writing Section###########################
    print("auto.txt has successfully saved as ./working/auto.txt")
    print("new_data.txt has successfully saved as ./working/new_data.txt")
    print("Everything has completed, Go to work")

# Files used by this run.
data_path = "working/work.csv"
train_data = "./train_data/reference.xlsx"
user_dict = "./train_data/userdict.txt"

# First fold the reviewed corrections into the reference workbook, then
# generate new drafts starting at row 55 of the work file.
retrain(txt_path="./working/new_data.txt", ref_path=train_data)
forward(data_path=data_path, train_data=train_data, user_dict=user_dict, start=55)

#####Retrain Started#####
若语
若语 : ['Ruoyu', '0', 1] 
暗纹
暗纹 : ['히든 디자인', '9', 1] 
瞌睡兔
瞌睡兔 : ['Keshhuitu', '0', 1] 
韩风
韩风 : ['한국 스타일', '9', 1] 
data has been updated 4 times
#####Retrain Ended#####


100%|██████████| 45/45 [00:11<00:00,  3.83it/s]

auto.txt has successfully saved as ./working/auto.txt
new_data.txt has successfully saved as ./working/new_data.txt
Everything has completed, Go to work





## Generating - word

One-hot encoding that includes position information.

meaning, category, position info

1. give main meaning
2. add meaning
3. add position meaning

words - word - meaning - cate - 
1
2
3
4

In [None]:
###Open other files and save them to this table too###
###No overlap