In [None]:
import json
import os
import pickle
import random
import re
import sys
from glob import glob

import jieba
from tqdm import tqdm


# Design attribute categories in a fixed order. In English the keys are:
# base silhouette, shoulder, placket, structure, waistline, hem, collar type.
# The sleeve category ("袖") defined in design_vars is not part of this list.
ordered_design_keys = [
    "基础廓形",
    "肩",
    "门襟",
    "结构",
    "腰节",
    "摆",
    "领型",
]

# Known design terms, grouped by attribute category.
design_vars = {
    # Base silhouette
    "基础廓形": [
        "H型", "X型", "A型", "O型", "T恤", "紧身", "宽松", "合体",
    ],
    # Structure
    "结构": [
        "无省", "收腰省", "袖窿公主线分割", "直线公主线分割", "公主线", "公主缝",
    ],
    # Waistline
    "腰节": [
        "无腰节", "断腰节", "拼腰节", "抽皱腰节", "单向褶", "风琴折",
    ],
    # Hem
    "摆": [
        "无摆", "荷叶摆", "荷叶边", "木耳边", "拼接抽皱",
        "不规则下摆", "两侧长中间短下摆", "前短后长下摆",
    ],
    # Collar type
    "领型": [
        "有领", "无领", "V领", "圆领", "椭圆形领", "U型领", "马蹄型领",
        "缺口领", "叠领", "套装领", "打裥领", "荡领", "心型领", "船领",
        "梯形领", "鸡心领", "高领", "高垂领", "锁孔领", "花瓣型领", "漏斗领",
        "水滴领", "亨利领", "抽绳式立领", "方领", "不对称领", "钻石领", "船员领",
        "立领", "衬衫领", "木耳边领", "娃娃领", "海军领", "蕾丝花边领", "切尔西式领",
        "中式马褂型领", "开领", "离颈领", "小丑领", "燕子领", "POLO领", "连身立领",
    ],
    # Placket
    "门襟": [
        "无门襟", "交叉门襟", "叠门襟", "半开门襟",
    ],
    # Shoulder
    "肩": [
        "正常肩", "正肩", "落肩", "连身袖",
        "连身插角袖插块", "连身插角袖插片", "插肩袖",
    ],
    # Sleeve
    "袖": [
        "无袖", "小飞袖", "天使款大衣袖", "袖山抽皱泡泡袖", "一片袖", "两片袖",
        "露肩袖", "藕节袖", "羊腿袖", "漏斗袖", "喇叭袖", "宝塔袖",
        "淑女袖", "缺角袖", "主教服式袖", "泡泡袖", "灯笼袖", "气球型袖",
        "蝴蝶袖", "层叠袖", "手帕袖", "衬衫袖", "木耳袖口", "娜塔夫袖口",
    ],
}

####################### 1. Load data items (RAW) #######################
# data_roots = [
#     "\\\\192.168.29.222\\Share\\工程数据Q1\\objs", 
#     # "\\\\192.168.29.222\\Share\\工程数据Q2_objs"
#     ]
# all_items = []
# print(">>> Locating data items...")
# for data_root in data_roots:
#     cur_items = glob(os.path.join(data_root, '**', 'pattern.json'), recursive=True)
#     cur_items = [os.path.dirname(x) for x in cur_items]
#     all_items += cur_items
#     print("%d items in %s."%(len(cur_items), data_root))
# print('[DONE] locating data items, total %d items.'%(len(all_items)))
########################################################################

####################### 2. Load data items (PICKLE) #######################
# data_root = "E:\\lry\\data\\AIGP\\demo_v2\\Q2\\brep_uni_norm\\"
# data_dirs = glob(os.path.join(data_root, '*.pkl'))
# all_items = []
# for data_item in tqdm(data_dirs):
#     with open(data_item, "rb") as tf: data = pickle.load(tf)
#     data_fp = os.path.basename(data['data_fp'])
#     data_fp = re.sub(r'\d+', '', data_fp).replace('-', '').replace('_', '').strip().upper()
#     all_items.append(data_fp)
# print('[DONE] Loading data items: ', len(all_items), all_items[:10])
###########################################################################


####################### 3. Load data items (JSON) #######################
# Build `all_items`: one normalised source-file name per pattern JSON file
# found directly under `data_root`.
data_root = "E:\\lry\\data\\AIGP\\demo_v2\\Q2\\patterns\\"
data_dirs = glob(os.path.join(data_root, '*.json'))
all_items = []
for data_item in tqdm(data_dirs):
    # The globbed files are JSON documents, so they must be parsed with
    # json.load; unpickling them (as the PICKLE variant above does for *.pkl
    # files) fails on JSON text. The file is opened the same way as in the
    # caption-generation code further down in this file, which reads the same
    # directory.
    with open(data_item, "r") as tf:
        data = json.load(tf)
    # The pattern JSON stores the original file path under "raw_data_fp"
    # ("data_fp" is the key used by the pickled records, not by these files).
    data_fp = os.path.basename(data['raw_data_fp'])
    # Normalise the name: drop all digit runs, remove '-' and '_', trim
    # surrounding whitespace and upper-case the result.
    data_fp = re.sub(r'\d+', '', data_fp).replace('-', '').replace('_', '').strip().upper()
    all_items.append(data_fp)
print('[DONE] Loading data items: ', len(all_items), all_items[:10])
###########################################################################


# Segment every entry of `all_items` with jieba, accumulate all resulting
# tokens in `total_words`, and log each "<sentence> => <tok>|<tok>|..." line
# to word_cut_result.txt for manual inspection.
print('>>> Try word cutting...')
jieba.load_userdict('dict.txt')
total_words = []
# Write the log as UTF-8 explicitly: it contains Chinese text, and relying on
# the platform default encoding can raise UnicodeEncodeError or yield a file
# in a locale-specific code page.
with open('word_cut_result.txt', 'w', encoding='utf-8') as f:
    for sentence in tqdm(all_items):
        words = jieba.lcut(sentence, cut_all=False)
        total_words.extend(words)
        f.write("%s => %s\n" % (sentence, '|'.join(words)))
print('[DONE] Total words after cut: ', len(total_words))

# total_words = [x.upper() for x in total_words]
# total_words_freq = dict([(x, total_words.count(x)) for x in total_words if x.strip()])
# total_words_freq = sorted(total_words_freq.items(), key=lambda x: x[1], reverse=True)    

# design_var_dict = {}
# for key in design_vars:
#     for val in design_vars[key]:
#         design_var_dict[val] = key

# totel_words_freq_category = {"其他": []}
# for word, freq in total_words_freq:
#     word = word.strip()
#     if not word: continue
#     elif word.upper() in design_var_dict:
#         if design_var_dict[word] not in totel_words_freq_category: totel_words_freq_category[design_var_dict[word]] = []
#         totel_words_freq_category[design_var_dict[word]].append((word, freq))
#     else:
#         totel_words_freq_category["其他"].append((word, freq))

# with open('word_freq.txt', 'w') as f:
#     for key in design_vars.keys():
#         f.write(key+':\n')
#         for word, freq in totel_words_freq_category[key]:
#             if word.strip() == '': continue
#             f.write('\t'+word+' '+str(freq)+'\n')
#         f.write('\n')
#     f.write("其他:\n")
#     for word, freq in totel_words_freq_category["其他"]:
#         if word.strip() == '': continue
#         if freq < 10: continue
#         f.write('\t'+word+' '+str(freq)+'\n')

In [9]:
import os
import sys

import re
import random

import json
import pickle

import jieba
from tqdm import tqdm

from glob import glob


# Design attribute categories in a fixed order. In English the keys are:
# base silhouette, shoulder, placket, structure, waistline, hem, collar type.
# The sleeve category ("袖") defined in design_vars is not part of this list.
ordered_design_keys = [
    "基础廓形",
    "肩",
    "门襟",
    "结构",
    "腰节",
    "摆",
    "领型",
]

# Known design terms, grouped by attribute category.
design_vars = {
    # Base silhouette
    "基础廓形": [
        "H型", "X型", "A型", "O型", "T恤", "紧身", "宽松", "合体",
    ],
    # Structure
    "结构": [
        "无省", "收腰省", "袖窿公主线分割", "直线公主线分割", "公主线", "公主缝",
    ],
    # Waistline
    "腰节": [
        "无腰节", "断腰节", "拼腰节", "抽皱腰节", "单向褶", "风琴折",
    ],
    # Hem
    "摆": [
        "无摆", "荷叶摆", "荷叶边", "木耳边", "拼接抽皱",
        "不规则下摆", "两侧长中间短下摆", "前短后长下摆",
    ],
    # Collar type
    "领型": [
        "有领", "无领", "V领", "圆领", "椭圆形领", "U型领", "马蹄型领",
        "缺口领", "叠领", "套装领", "打裥领", "荡领", "心型领", "船领",
        "梯形领", "鸡心领", "高领", "高垂领", "锁孔领", "花瓣型领", "漏斗领",
        "水滴领", "亨利领", "抽绳式立领", "方领", "不对称领", "钻石领", "船员领",
        "立领", "衬衫领", "木耳边领", "娃娃领", "海军领", "蕾丝花边领", "切尔西式领",
        "中式马褂型领", "开领", "离颈领", "小丑领", "燕子领", "POLO领", "连身立领",
    ],
    # Placket
    "门襟": [
        "无门襟", "交叉门襟", "叠门襟", "半开门襟",
    ],
    # Shoulder
    "肩": [
        "正常肩", "正肩", "落肩", "连身袖",
        "连身插角袖插块", "连身插角袖插片", "插肩袖",
    ],
    # Sleeve
    "袖": [
        "无袖", "小飞袖", "天使款大衣袖", "袖山抽皱泡泡袖", "一片袖", "两片袖",
        "露肩袖", "藕节袖", "羊腿袖", "漏斗袖", "喇叭袖", "宝塔袖",
        "淑女袖", "缺角袖", "主教服式袖", "泡泡袖", "灯笼袖", "气球型袖",
        "蝴蝶袖", "层叠袖", "手帕袖", "衬衫袖", "木耳袖口", "娜塔夫袖口",
    ],
}


# Load the user dictionary file into jieba before any segmentation happens.
jieba.load_userdict('dict.txt')

####################### 3. Load data items (JSON) #######################
# For every pattern JSON under `data_root`: derive a caption from the basename
# of its "raw_data_fp" entry, store it under "caption", and write the augmented
# spec to a file of the same name under `output_root`.
data_root = "E:\\lry\\data\\AIGP\\demo_v2\\Q2\\patterns\\"
output_root = "E:\\lry\\data\\AIGP\\demo_v2\\Q2\\patterns_with_caption\\"
os.makedirs(output_root, exist_ok=True)
data_dirs = glob(os.path.join(data_root, '*.json'))
all_items = []
for data_item in tqdm(data_dirs):
    # Best effort per file: any failure is reported and the loop moves on.
    try:
        with open(data_item, "r") as src:
            pattern_spec = json.load(src)

        # Normalise the source file name: strip digit runs, remove '-' and
        # '_', trim surrounding whitespace, upper-case.
        raw_name = os.path.basename(pattern_spec["raw_data_fp"])
        cleaned_name = re.sub(r'\d+', '', raw_name)
        cleaned_name = cleaned_name.replace('-', '').replace('_', '')
        cleaned_name = cleaned_name.strip().upper()

        # Segment the name and keep every token except "连衣裙", which is
        # emitted exactly once as the first caption element instead.
        kept_tokens = []
        for token in jieba.lcut(cleaned_name, cut_all=False):
            if token.strip() == "连衣裙":
                continue
            kept_tokens.append(token)
        data_caption = ["连衣裙", ", ".join(kept_tokens)]
        pattern_spec["caption"] = data_caption

        # ensure_ascii=False keeps the Chinese text readable in the output.
        target_fp = os.path.join(output_root, os.path.basename(data_item))
        with open(target_fp, "w", encoding='utf-8') as dst:
            json.dump(pattern_spec, dst, indent=4, ensure_ascii=False)
    except Exception as err:
        print('[ERROR] ', data_item, err)

###########################################################################

100%|██████████| 11487/11487 [03:36<00:00, 53.12it/s] 
