In [1]:
import json
import numpy as np
from tqdm import tqdm
from pprint import pprint
from sklearn.cluster import KMeans

from label_processor import CStdLib

In [2]:
# Read the per-product logits file (UTF-8 JSON) into ``logits_json``.
with open('result_logits.json', encoding='utf-8') as logits_file:
    logits_json = json.loads(logits_file.read())

In [3]:
# Instantiate the project-local CStdLib helper imported from label_processor.
# The labelling cell calls it as embed(tag) and uses the returned sequence to
# index columns of the per-image logits array.
# NOTE(review): the effect of single=False is not visible in this notebook —
# confirm against label_processor.
embed = CStdLib(single=False)

In [4]:
def _parse_image_logits(product_id, imgs_tags):
    """Parse every image's logits string into one float32 row.

    Entry ``n`` of ``imgs_tags`` is a dict whose key ``'<product_id>_<n>.jpg'``
    maps to a space-separated string of numbers.  Returns the rows stacked
    into a single array (one row per image, in list order).
    """
    return np.array([
        np.fromstring(entry[product_id + '_{}.jpg'.format(n)], dtype=np.float32, sep=' ')
        for n, entry in enumerate(imgs_tags)
    ])


def _score_clusters(imgs_logits, cluster_ids, n_clusters, tag_logit_ids):
    """Score every (cluster, tag) pair.

    ``match_score[c, t]`` is the mean, over the images assigned to cluster
    ``c``, of the logit columns listed in ``tag_logit_ids[t]``.  A tag with no
    logit ids gets the placeholder score 1 for every cluster.
    """
    match_score = np.zeros((n_clusters, len(tag_logit_ids)))
    for c in range(n_clusters):
        cluster_logits = imgs_logits[cluster_ids == c]
        for t, logit_ids in enumerate(tag_logit_ids):
            if len(logit_ids) > 0:
                match_score[c, t] = np.mean(cluster_logits[:, logit_ids])
            else:
                match_score[c, t] = 1
    return match_score


def _assign_unique_tags(best_tag, best_score, num_tags):
    """Give every cluster a distinct tag index.

    ``best_tag[c]`` / ``best_score[c]`` are the preferred tag of cluster ``c``
    and its score.  When several clusters prefer the same tag, only the one
    with the strictly highest score keeps it (on a tie the later cluster
    wins); the others receive the still-unused tags in ascending index order.
    Requires ``len(best_tag) <= num_tags``.  Returns a new array; the inputs
    are not modified.
    """
    tag_of_cluster = np.array(best_tag)
    n_clusters = len(tag_of_cluster)
    for a in range(n_clusters):
        if tag_of_cluster[a] == -1:
            continue
        for b in range(a + 1, n_clusters):
            if tag_of_cluster[b] != tag_of_cluster[a]:
                continue
            if best_score[a] > best_score[b]:
                tag_of_cluster[b] = -1
            else:
                # ``a`` lost its tag; ``b`` settles the remaining conflicts
                # for this tag when it becomes the outer index.
                tag_of_cluster[a] = -1
                break
    taken = {int(t) for t in tag_of_cluster if t != -1}
    # sorted() makes the hand-out order explicit instead of relying on the
    # iteration order of a set.
    spare_tags = iter(sorted(set(range(num_tags)) - taken))
    for c in range(n_clusters):
        if tag_of_cluster[c] == -1:
            tag_of_cluster[c] = next(spare_tags)
    return tag_of_cluster


# For every product: cluster its images by logits, match clusters to the
# product's optional tags, and overwrite each image's logits string with the
# chosen tag.
# NOTE: the overwrite happens in place inside ``logits_json``, so this cell can
# only be run once per load of the logits file.
pbar = tqdm(logits_json.items())
for key, prod_dict in pbar:
    optional_tags = prod_dict['optional_tags']
    imgs_tags = prod_dict['imgs_tags']
    if not optional_tags or not imgs_tags:
        # Nothing to cluster or nothing to choose from; leave the entry untouched.
        continue
    num_class = len(optional_tags)
    imgs_logits = _parse_image_logits(key, imgs_tags)

    # Cluster the images.  KMeans rejects n_clusters > n_samples, so cap the
    # cluster count at the number of images.
    n_clusters = min(num_class, len(imgs_logits))
    cluster = KMeans(n_clusters=n_clusters, random_state=42).fit(imgs_logits)
    labels = cluster.labels_

    # Compute match scores: match_score[i, j] is how well k-means cluster i
    # matches optional_tags[j].
    embeded_tags = [embed(tag) for tag in optional_tags]
    not_matched = any(len(logit_ids) == 0 for logit_ids in embeded_tags)
    match_score = _score_clusters(imgs_logits, labels, n_clusters, embeded_tags)

    # Normalise the scores so that every column has unit ord-norm, then take
    # the maximum of every row as that cluster's tag.  Normalising by column
    # and maximising by row makes the choice two-sided.  A higher-order norm
    # raises the relative weight of unmatched tags, which helps acc/em.
    match_score = np.maximum(match_score, 0) + 1e-7  # keep the positive part; 1e-7 avoids a divide warning
    match_score_norm = match_score / np.linalg.norm(match_score, axis=0, ord=3)
    label_id = np.argmax(match_score_norm, axis=1)
    label_score = np.max(match_score_norm, axis=1)

    # Force every cluster onto a distinct tag so that (when there are as many
    # clusters as tags) every tag is used.
    label_id = _assign_unique_tags(label_id, label_score, num_class)

    # Finally write the chosen tag over each image's logits string.
    for img_idx, cluster_idx in enumerate(labels):
        imgs_tags[img_idx][key + '_{}.jpg'.format(img_idx)] = optional_tags[label_id[cluster_idx]]
    if not_matched:
        # Show products that contain a tag embed() could not map to any logit.
        print(key, '\n', match_score, '\n', match_score_norm)
        pprint(logits_json[key])

  2%|▏         | 96/5331 [00:00<00:42, 124.55it/s]

623030476282 
 [[6.47678147e+00 1.00000010e+00]
 [1.00000000e-07 1.00000010e+00]] 
 [[1.00000000e+00 7.93700526e-01]
 [1.54397675e-08 7.93700526e-01]]
{'imgs_tags': [{'623030476282_0.jpg': '9954黑色西装外套'},
               {'623030476282_1.jpg': '9954黑色西装外套'},
               {'623030476282_2.jpg': '9953条纹衬衣'},
               {'623030476282_3.jpg': '9954黑色西装外套'},
               {'623030476282_4.jpg': '9953条纹衬衣'},
               {'623030476282_5.jpg': '9954黑色西装外套'},
               {'623030476282_6.jpg': '9954黑色西装外套'}],
 'optional_tags': ['9954黑色西装外套', '9953条纹衬衣']}


  5%|▌         | 281/5331 [00:02<01:02, 80.72it/s] 

624952669722 
 [[1.00000000e-07 1.00000010e+00]
 [4.34332113e-01 1.00000010e+00]] 
 [[2.30238559e-07 7.93700526e-01]
 [1.00000000e+00 7.93700526e-01]]
{'imgs_tags': [{'624952669722_0.jpg': '珊瑚色组合'},
               {'624952669722_1.jpg': '珊瑚色组合'},
               {'624952669722_2.jpg': '橄榄色组合'},
               {'624952669722_3.jpg': '橄榄色组合'},
               {'624952669722_4.jpg': '橄榄色组合'}],
 'optional_tags': ['珊瑚色组合', '橄榄色组合']}


  6%|▋         | 338/5331 [00:03<01:08, 72.87it/s] 

614043736149 
 [[1.14102091e-01 1.00000000e-07 1.00000000e-07 1.00000010e+00
  1.00000000e-07]
 [1.00000000e-07 1.00000000e-07 5.36092387e+00 1.00000010e+00
  1.00000000e-07]
 [1.00000000e-07 5.13085804e+00 1.00000000e-07 1.00000010e+00
  1.00000000e-07]
 [1.00000000e-07 1.00000000e-07 1.00000000e-07 1.00000010e+00
  7.67332850e+00]
 [7.36306057e+00 1.00000000e-07 1.00000000e-07 1.00000010e+00
  1.00000000e-07]] 
 [[1.54965382e-02 1.94899175e-08 1.86535012e-08 5.84803548e-01
  1.30321542e-08]
 [1.35812920e-08 1.94899175e-08 1.00000000e+00 5.84803548e-01
  1.30321542e-08]
 [1.35812920e-08 1.00000000e+00 1.86535012e-08 5.84803548e-01
  1.30321542e-08]
 [1.35812920e-08 1.94899175e-08 1.86535012e-08 5.84803548e-01
  1.00000000e+00]
 [9.99998760e-01 1.94899175e-08 1.86535012e-08 5.84803548e-01
  1.30321542e-08]]
{'imgs_tags': [{'614043736149_0.jpg': '条纹'},
               {'614043736149_1.jpg': '桔色'},
               {'614043736149_2.jpg': '红色'},
               {'614043736149_3.jpg': '桔色'},
 

 15%|█▌        | 818/5331 [00:08<00:35, 125.80it/s]

565607486988 
 [[6.4931532  1.0000001 ]
 [7.34086428 1.0000001 ]] 
 [[0.74228789 0.79370053]
 [0.839197   0.79370053]]
{'imgs_tags': [{'565607486988_0.jpg': '黑色'},
               {'565607486988_1.jpg': '条纹'},
               {'565607486988_2.jpg': '条纹'},
               {'565607486988_3.jpg': '条纹'},
               {'565607486988_4.jpg': '条纹'},
               {'565607486988_5.jpg': '条纹'},
               {'565607486988_6.jpg': '条纹'}],
 'optional_tags': ['黑色', '条纹']}


 30%|██▉       | 1589/5331 [00:17<00:30, 121.59it/s]

623845128856 
 [[4.63337104e-01 1.00000010e+00 9.81692229e+00]
 [6.22044907e+00 1.00000010e+00 1.00000000e-07]
 [9.61142104e-01 1.00000010e+00 5.95787011e+00]] 
 [[7.43845442e-02 6.93361274e-01 9.34963040e-01]
 [9.98636338e-01 6.93361274e-01 9.52399349e-09]
 [1.54302595e-01 6.93361274e-01 5.67427161e-01]]
{'imgs_tags': [{'623845128856_0.jpg': '黑色'},
               {'623845128856_1.jpg': '黑色'},
               {'623845128856_2.jpg': '条纹'},
               {'623845128856_3.jpg': '白色'},
               {'623845128856_4.jpg': '条纹'},
               {'623845128856_5.jpg': '白色'}],
 'optional_tags': ['黑色', '条纹', '白色']}
605311018824 
 [[1.00000000e-07 1.00000010e+00 4.55920802e+00]
 [5.13982020e+00 1.00000010e+00 8.81051640e-01]
 [5.99557505e+00 1.00000010e+00 1.00000000e-07]] 
 [[1.41722425e-08 6.93361274e-01 9.97605958e-01]
 [7.28427785e-01 6.93361274e-01 1.92784002e-01]
 [8.49707438e-01 6.93361274e-01 2.18811240e-08]]
{'imgs_tags': [{'605311018824_0.jpg': '条纹'},
               {'605311018824_1.

 31%|███▏      | 1679/5331 [00:18<00:30, 119.78it/s]

621875084587 
 [[5.58567105 1.0000001 ]
 [3.25660167 1.0000001 ]] 
 [[0.9415113  0.79370053]
 [0.54892729 0.79370053]]
{'imgs_tags': [{'621875084587_0.jpg': '条纹'},
               {'621875084587_1.jpg': '条纹'},
               {'621875084587_2.jpg': '条纹'},
               {'621875084587_3.jpg': '条纹'},
               {'621875084587_4.jpg': '黑色'},
               {'621875084587_5.jpg': '黑色'},
               {'621875084587_6.jpg': '黑色'}],
 'optional_tags': ['黑色', '条纹']}


 42%|████▏     | 2239/5331 [00:23<00:25, 123.04it/s]

614488576767 
 [[1.00000010e+00 5.88486348e+00]
 [1.00000010e+00 1.00000000e-07]] 
 [[7.93700526e-01 1.00000000e+00]
 [7.93700526e-01 1.69927476e-08]]
{'imgs_tags': [{'614488576767_0.jpg': '白色'},
               {'614488576767_1.jpg': '彩色'},
               {'614488576767_2.jpg': '彩色'},
               {'614488576767_3.jpg': '彩色'},
               {'614488576767_4.jpg': '白色'},
               {'614488576767_5.jpg': '彩色'},
               {'614488576767_6.jpg': '白色'}],
 'optional_tags': ['彩色', '白色']}


 43%|████▎     | 2304/5331 [00:23<00:25, 120.04it/s]

576888702197 
 [[3.07519398e+00 1.00000010e+00 1.98595102e-01 4.72557126e+00]
 [1.00000000e-07 1.00000010e+00 5.61172925e+00 1.01500712e+00]
 [1.00000000e-07 1.00000010e+00 8.07319085e-01 8.53614436e+00]
 [5.65599070e+00 1.00000010e+00 1.00000000e-07 1.88747666e+00]] 
 [[5.17353047e-01 6.29960525e-01 3.53537134e-02 5.23562102e-01]
 [1.68234281e-08 6.29960525e-01 9.98994765e-01 1.12456089e-01]
 [1.68234281e-08 6.29960525e-01 1.43718185e-01 9.45748449e-01]
 [9.51531527e-01 6.29960525e-01 1.78019060e-08 2.09119955e-01]]
{'imgs_tags': [{'576888702197_0.jpg': '黑色'},
               {'576888702197_1.jpg': '黑色'},
               {'576888702197_2.jpg': '条纹色'},
               {'576888702197_3.jpg': '白色'},
               {'576888702197_4.jpg': '灰色'},
               {'576888702197_5.jpg': '灰色'},
               {'576888702197_6.jpg': '条纹色'},
               {'576888702197_7.jpg': '黑色'},
               {'576888702197_8.jpg': '白色'}],
 'optional_tags': ['黑色', '条纹色', '灰色', '白色']}


 44%|████▍     | 2354/5331 [00:24<00:24, 119.29it/s]

621891805225 
 [[7.04167901e+00 1.00000010e+00]
 [1.00000000e-07 1.00000010e+00]] 
 [[1.00000000e+00 7.93700526e-01]
 [1.42011585e-08 7.93700526e-01]]
{'imgs_tags': [{'621891805225_0.jpg': 'Black'},
               {'621891805225_1.jpg': 'Black'},
               {'621891805225_2.jpg': 'Smoked'},
               {'621891805225_3.jpg': 'Black'},
               {'621891805225_4.jpg': 'Black'},
               {'621891805225_5.jpg': 'Black'},
               {'621891805225_6.jpg': 'Black'}],
 'optional_tags': ['Black', 'Smoked']}


 46%|████▋     | 2472/5331 [00:25<00:23, 121.30it/s]

603281751757 
 [[8.02125845 1.0000001 ]
 [6.29499636 1.0000001 ]] 
 [[0.87683745 0.79370053]
 [0.68813249 0.79370053]]
{'imgs_tags': [{'603281751757_0.jpg': '拼色'},
               {'603281751757_1.jpg': '黑色'},
               {'603281751757_2.jpg': '黑色'},
               {'603281751757_3.jpg': '拼色'},
               {'603281751757_4.jpg': '黑色'},
               {'603281751757_5.jpg': '拼色'},
               {'603281751757_6.jpg': '拼色'}],
 'optional_tags': ['黑色', '拼色']}
617278066189 
 [[1.09488092 1.0000001 ]
 [1.70013211 1.0000001 ]] 
 [[0.59513489 0.79370053]
 [0.92412601 0.79370053]]
{'imgs_tags': [{'617278066189_0.jpg': '条纹套装'},
               {'617278066189_1.jpg': '白色套装'},
               {'617278066189_2.jpg': '条纹套装'},
               {'617278066189_3.jpg': '条纹套装'},
               {'617278066189_4.jpg': '条纹套装'},
               {'617278066189_5.jpg': '条纹套装'}],
 'optional_tags': ['白色套装', '条纹套装']}


 53%|█████▎    | 2824/5331 [00:28<00:21, 118.43it/s]

617521964984 
 [[1.00000000e-07 2.18213306e+00 1.00000010e+00 2.22292099e+00]
 [5.36240731e+00 1.00000000e-07 1.00000010e+00 1.00000000e-07]
 [8.58951019e-02 1.93567715e+00 1.00000010e+00 4.81650792e+00]
 [1.53612409e+00 1.00459514e+00 1.00000010e+00 1.26407705e+00]] 
 [[1.85044422e-08 8.22745832e-01 6.29960525e-01 4.44891558e-01]
 [9.92283558e-01 3.77037426e-08 6.29960525e-01 2.00138268e-08]
 [1.58944094e-02 7.29822731e-01 6.29960525e-01 9.63967553e-01]
 [2.84251194e-01 3.78769966e-01 6.29960525e-01 2.52990191e-01]]
{'imgs_tags': [{'617521964984_0.jpg': '灰色'},
               {'617521964984_1.jpg': '黑色'},
               {'617521964984_2.jpg': '黑色'},
               {'617521964984_3.jpg': '白色'},
               {'617521964984_4.jpg': '条纹'},
               {'617521964984_5.jpg': '白色'},
               {'617521964984_6.jpg': '灰色'},
               {'617521964984_7.jpg': '条纹'}],
 'optional_tags': ['黑色', '灰色', '条纹', '白色']}


 58%|█████▊    | 3088/5331 [00:30<00:19, 114.36it/s]

614491314779 
 [[4.92362509 1.0000001 ]
 [5.61240969 1.0000001 ]] 
 [[0.73866949 0.79370053]
 [0.84200477 0.79370053]]
{'imgs_tags': [{'614491314779_0.jpg': '竖条纹'},
               {'614491314779_1.jpg': '黑白点'},
               {'614491314779_2.jpg': '竖条纹'},
               {'614491314779_3.jpg': '黑白点'},
               {'614491314779_4.jpg': '黑白点'},
               {'614491314779_5.jpg': '黑白点'},
               {'614491314779_6.jpg': '竖条纹'}],
 'optional_tags': ['黑白点', '竖条纹']}


 60%|██████    | 3213/5331 [00:31<00:16, 124.66it/s]

621909719023 
 [[1.00000000e-07 1.00000010e+00 1.00000000e-07]
 [1.00000000e-07 1.00000010e+00 5.48934708e+00]
 [7.12543116e+00 1.00000010e+00 1.00000000e-07]] 
 [[1.40342385e-08 6.93361274e-01 1.82171028e-08]
 [1.40342385e-08 6.93361274e-01 1.00000000e+00]
 [1.00000000e+00 6.93361274e-01 1.82171028e-08]]
{'imgs_tags': [{'621909719023_0.jpg': '红色（斜裁吊带裙）'},
               {'621909719023_1.jpg': '浅蓝（斜裁吊带裙）'},
               {'621909719023_2.jpg': '浅蓝（斜裁吊带裙）'},
               {'621909719023_3.jpg': '浅蓝（斜裁吊带裙）'},
               {'621909719023_4.jpg': '红色（斜裁吊带裙）'},
               {'621909719023_5.jpg': '奶茶（斜裁吊带裙）'},
               {'621909719023_6.jpg': '奶茶（斜裁吊带裙）'},
               {'621909719023_7.jpg': '浅蓝（斜裁吊带裙）'}],
 'optional_tags': ['红色（斜裁吊带裙）', '奶茶（斜裁吊带裙）', '浅蓝（斜裁吊带裙）']}


 64%|██████▍   | 3435/5331 [00:33<00:16, 115.45it/s]

621931894053 
 [[1.0000001  0.4594566  3.9506465 ]
 [1.0000001  5.68938933 0.58009313]
 [1.0000001  2.11243926 4.90918122]] 
 [[0.69336127 0.07941083 0.69948192]
 [0.69336127 0.98333363 0.10270842]
 [0.69336127 0.36510642 0.86919534]]
{'imgs_tags': [{'621931894053_0.jpg': '白色半裙'},
               {'621931894053_1.jpg': '白色半裙'},
               {'621931894053_2.jpg': '条纹衬衫'},
               {'621931894053_3.jpg': '黑色半裙'},
               {'621931894053_4.jpg': '条纹衬衫'},
               {'621931894053_5.jpg': '白色半裙'},
               {'621931894053_6.jpg': '黑色半裙'}],
 'optional_tags': ['条纹衬衫', '黑色半裙', '白色半裙']}


 70%|██████▉   | 3728/5331 [00:35<00:12, 123.58it/s]

601450019157 
 [[1.0000001]] 
 [[1.]]
{'imgs_tags': [{'601450019157_0.jpg': '1906纯色'},
               {'601450019157_1.jpg': '1906纯色'},
               {'601450019157_2.jpg': '1906纯色'}],
 'optional_tags': ['1906纯色']}


 71%|███████   | 3781/5331 [00:36<00:12, 125.31it/s]

618434936298 
 [[1.00000010e+00 4.24226389e+00]
 [1.00000010e+00 1.00000000e-07]] 
 [[7.93700526e-01 1.00000000e+00]
 [7.93700526e-01 2.35723195e-08]]
{'imgs_tags': [{'618434936298_0.jpg': '090粉红色'},
               {'618434936298_1.jpg': '990混色'},
               {'618434936298_2.jpg': '990混色'},
               {'618434936298_3.jpg': '090粉红色'},
               {'618434936298_4.jpg': '090粉红色'},
               {'618434936298_5.jpg': '090粉红色'},
               {'618434936298_6.jpg': '090粉红色'}],
 'optional_tags': ['990混色', '090粉红色']}


 82%|████████▏ | 4376/5331 [00:41<00:08, 118.40it/s]

625820335127 
 [[4.74958382e+00 1.00000010e+00 1.38407479e+00]
 [6.14199440e-01 1.00000010e+00 5.44424067e+00]
 [6.98068057e+00 1.00000010e+00 1.00000000e-07]] 
 [[6.20931762e-01 6.93361274e-01 2.52849982e-01]
 [8.02967071e-02 6.93361274e-01 9.94582205e-01]
 [9.12611809e-01 6.93361274e-01 1.82685202e-08]]
{'imgs_tags': [{'625820335127_0.jpg': '白色'},
               {'625820335127_1.jpg': '白色'},
               {'625820335127_2.jpg': '条纹'},
               {'625820335127_3.jpg': '黑色'},
               {'625820335127_4.jpg': '条纹'},
               {'625820335127_5.jpg': '白色'},
               {'625820335127_6.jpg': '条纹'},
               {'625820335127_7.jpg': '黑色'}],
 'optional_tags': ['黑色', '条纹', '白色']}


 85%|████████▌ | 4533/5331 [00:42<00:06, 119.98it/s]

577541680180 
 [[9.96545802e+00 1.00000010e+00 2.70467554e+00 1.00000000e-07]
 [1.00000000e-07 1.00000010e+00 1.00000000e-07 8.64553462e+00]
 [1.08791361e+00 1.00000010e+00 7.65686665e+00 1.00000000e-07]
 [1.00000000e-07 1.00000010e+00 1.00000000e-07 3.29344402e+00]] 
 [[9.99566695e-01 6.29960525e-01 3.48193124e-01 1.13610582e-08]
 [1.00303136e-08 6.29960525e-01 1.28737484e-08 9.82224217e-01]
 [1.09121148e-01 6.29960525e-01 9.85725750e-01 1.13610582e-08]
 [1.00303136e-08 6.29960525e-01 1.28737484e-08 3.74170091e-01]]
{'imgs_tags': [{'577541680180_0.jpg': '红色'},
               {'577541680180_1.jpg': '红色'},
               {'577541680180_2.jpg': '蓝色'},
               {'577541680180_3.jpg': '蓝色'},
               {'577541680180_4.jpg': '橙色'},
               {'577541680180_5.jpg': '深色'},
               {'577541680180_6.jpg': '橙色'},
               {'577541680180_7.jpg': '蓝色'},
               {'577541680180_8.jpg': '蓝色'}],
 'optional_tags': ['橙色', '深色', '红色', '蓝色']}


 90%|████████▉ | 4780/5331 [00:44<00:04, 120.03it/s]

612126042525 
 [[1.0000001  6.93167267]
 [1.0000001  3.33070503]] 
 [[0.79370053 0.96553849]
 [0.79370053 0.4639463 ]]
{'imgs_tags': [{'612126042525_0.jpg': '957黑色'},
               {'612126042525_1.jpg': '957黑色'},
               {'612126042525_2.jpg': '957黑色'},
               {'612126042525_3.jpg': '6802格子'},
               {'612126042525_4.jpg': '957黑色'},
               {'612126042525_5.jpg': '957黑色'},
               {'612126042525_6.jpg': '957黑色'}],
 'optional_tags': ['6802格子', '957黑色']}


100%|██████████| 5331/5331 [00:49<00:00, 107.62it/s]


In [5]:
# Write the labelled result.  ensure_ascii=False emits the tags as raw
# non-ASCII text, so the file encoding is pinned to UTF-8 (matching how the
# input was read) instead of depending on the platform default.
with open('./result_labels.json', 'w', encoding='utf-8') as f:
    json.dump(logits_json, f, indent=4, ensure_ascii=False)