In [1]:
import re
import jieba
import numpy as np
from gensim.models import KeyedVectors


from utils import ColorfulClothes

In [2]:
word2vec = KeyedVectors.load_word2vec_format('./merge_sgns_bigram_char300.txt', binary=False)

In [3]:
# Reference: https://blog.csdn.net/bailixuance/article/details/89555580
def find_chinese(file):
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese = re.sub(pattern, '', file)
    return chinese

In [4]:
dataset = ColorfulClothes('./dataset/medium/', train=True)
dataset_test = ColorfulClothes('./dataset/medium/', train=False)


threshold = 0.45
color = '颜色'
redundancy = []
out_of_dict = []

def white_list(word):
    return '色' in word or '绿' in word or '青' in word or '黑' in word or '白' in word or '黄' in word

for images, labels in dataset:
    opt_tags = labels['optional_tags']
    for tags in opt_tags:
        tags = find_chinese(tags)
        words = jieba.lcut(tags, cut_all=True)
        for word in words:
            try:
                cov = word2vec.similarity(color, word)
                if cov < threshold and not white_list(word):
                    # redundancy.append(word + 'in' + tags)
                    redundancy.append(word)
            except:
                out_of_dict.append(word)


for images, labels in dataset_test:
    opt_tags = labels['optional_tags']
    for tags in opt_tags:
        tags = find_chinese(tags)
        words = jieba.lcut(tags, cut_all=True)
        for word in words:
            try:
                cov = word2vec.similarity(color, word)
                if cov < threshold and not white_list(word):
                    # redundancy.append(word + 'in' + tags)
                    redundancy.append(word)
            except:
                out_of_dict.append(word)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/0w/j3hd49492bncw4zlczqbyrmr0000gn/T/jieba.cache
Loading model cost 0.313 seconds.
Prefix dict has been built successfully.


In [5]:
redundancy = list(set(redundancy))
out_of_dict = list(set(out_of_dict))
print('Length of redundancy:', len(redundancy))
print('Length of out of dictionary:', len(out_of_dict))

Length of redundancy: 1955
Length of out of dictionary: 48


In [6]:
np.save('redundancy.npy', redundancy)
np.save('out_of_dict.npy', out_of_dict)

In [21]:
word_test = '现货'
print(word_test in redundancy)
print(word2vec.similarity(word_test, '颜色'))

word_test2 = '牛油果'
print(word2vec.similarity(word_test2, '浅绿'))

True
0.24938019
0.29219952


In [8]:
print('Redundancy:\n', redundancy)
print('Out of dictionary:\n', out_of_dict)

Redundancy:
 ['味儿', '单面', '抽象', '单条', '金鼓', '赤红', '熏烟', '磁铁', '不分', '越', '短', '天轮', '星辰', '凤尾', '明眼', '蒸汽', '蒂', '分', '均', '脚底', '小笨蛋', '蓬蓬', '香蕉', '猴子', '条', '芭比', '砖', '枚', '天内', '二件套', '支持', '沁', '烘焙', '袖长', '撞', '着', '金橘', '香芋', '蓟', '假', '衣领', '蕊', '染上', '批次', '蜜桔', '法国', '斜纹', '微', '锈', '元气', '瀑布', '喇叭裤', '欧洲', '大网', '裤装', '一见钟情', '雾霭', '贷', '三角', '涂鸦', '然', '加开', '袖', '风尚', '大利', '逸', '晕染', '底壳', '风衣', '兜', '克莱', '紫花', '奥兰', '小众', '单独', '砂', '丁香', '优先', '紫藤', '新', '思', '豆粉', '发货', '粗', '森', '浅', '酸奶', '叶子', '售罄', '子棉', '啡', '尼斯', '梦', '防晒', '门禁', '海红', '蛋', '单卡', '蔓', '个性', '建议', '记', '被', '带大', '栀子', '宝石', '漂', '透视', '琳', '琉', '边上', '玫', '汽车', '花篮', '泥金', '金盏花', '孤', '拼', '萌', '小姐', '两件套', '小鸭', '猪', '外强', '塞拉', '无货', '胸前', '花蕾', '汉服', '柑橘', '岩石', '今日', '柄', '退换', '棉质', '比', '赭石', '苔藓', '工装', '代尔', '衣', '丝', '头', '蝶粉', '文胸', '绣', '可可粉', '嫩草', '嫩肤', '薯', '缺货', '心花', '接连', '深空', '鱼鳞', '金盏', '组合', '烟草', '番茄', '无情', '胸针', '女红', '瑞', '兰花', '熏', '肤', '咸菜', '装酒', '吊带', '束', '子夜', '姐姐'