In [11]:
import json
import numpy as np
from tqdm import tqdm
from pprint import pprint
from sklearn.cluster import KMeans

from label_processor import CStdLib

In [12]:
# Load result_logits.json (decoded as UTF-8) and parse it into `logits_json`.
with open('result_logits.json', encoding='utf-8') as logits_file:
    logits_json = json.loads(logits_file.read())

In [13]:
# Build the tag embedder. The clustering cell calls embed(tag) for every optional tag and
# uses the returned sequence to select logit columns (an empty result means "no match").
# NOTE(review): the meaning of single=False is defined in label_processor.CStdLib — confirm there.
embed = CStdLib(single=False)

In [14]:
# For every product: cluster its images by their logits, match each cluster to one of the
# product's optional tags, and overwrite each image's logits string with the chosen tag.
#
# NOTE: `logits_json` is modified in place (logits strings are replaced by tag strings), so
# this cell cannot be re-run without first re-running the cell that loads the JSON file.
pbar = tqdm(logits_json.items())
for key, prod_dict in pbar:
    optional_tags = prod_dict['optional_tags']
    num_class = len(optional_tags)
    imgs_tags = prod_dict['imgs_tags']
    if num_class == 0 or len(imgs_tags) == 0:
        # Nothing to cluster or nothing to assign; KMeans would raise on this input.
        print(key, 'skipped: no images or no optional tags')
        continue

    # Each entry of imgs_tags is a dict keyed '<product>_<index>.jpg' whose value is a
    # space-separated string of logits; parse them into one row per image.
    imgs_logits = np.array([
        np.fromstring(img_tag[key + '_{}.jpg'.format(img_idx)], dtype=np.float32, sep=' ')
        for img_idx, img_tag in enumerate(imgs_tags)
    ])

    # Clustering. KMeans raises ValueError when n_clusters exceeds the number of samples,
    # so cap the cluster count for products with fewer images than tags.
    num_cluster = min(num_class, len(imgs_logits))
    cluster = KMeans(n_clusters=num_cluster, random_state=42).fit(imgs_logits)
    labels = cluster.labels_
    # Image indices belonging to each cluster, computed once and reused below.
    cluster_members = [np.where(labels == c)[0] for c in range(num_cluster)]

    # Matching scores.
    # match_score[c, t] is how well KMeans cluster c matches optional_tags[t].
    match_score = np.zeros((num_cluster, num_class))
    embeded_tags = [embed(tag) for tag in optional_tags]
    not_matched = False
    for c, member_ids in enumerate(cluster_members):
        if member_ids.size == 0:
            # KMeans can return fewer distinct clusters than requested (duplicate points).
            # Leave the row at zero rather than letting np.mean of an empty slice yield NaN.
            continue
        logits = imgs_logits[member_ids]
        for t, logits_ids in enumerate(embeded_tags):
            if len(logits_ids) > 0:
                match_score[c, t] = np.mean(logits[:, logits_ids])
            else:
                # embed() returned no logit columns for this tag: use a constant placeholder
                # score and remember to print diagnostics for this product.
                not_matched = True
                match_score[c, t] = 1

    # Normalise the scores so that every column has unit ord-norm, then take the maximum of
    # each row as the tag for that row's cluster. Normalising by column and maximising by
    # row makes the selection two-way. A higher-order norm raises the relative score of
    # unmatched tags, which helps acc/em.
    match_score_norm = match_score / np.linalg.norm(match_score, axis=0, ord=3)
    label_id = np.argmax(match_score_norm, axis=1)
    label_score = np.max(match_score_norm, axis=1)

    # Force every tag to be used: when several clusters pick the same tag, only the one
    # with the highest score keeps it; the others are marked -1 and reassigned below.
    selected_label_id = []
    for a in range(num_cluster):
        if label_id[a] == -1:
            continue
        selected_label_id.append(label_id[a])
        for b in range(a + 1, num_cluster):
            if label_id[a] == label_id[b]:
                if label_score[a] > label_score[b]:
                    label_id[b] = -1
                else:
                    label_id[a] = -1
    selected_label_id = set(selected_label_id)
    unselected = list(set(range(num_class)) - selected_label_id)
    # Hand the still-unused tags to the clusters that lost their first choice, in order.
    next_free = 0
    for c in range(num_cluster):
        if label_id[c] == -1:
            label_id[c] = unselected[next_free]
            next_free += 1

    # Finally write the chosen tag back over each image's logits string.
    for c, member_ids in enumerate(cluster_members):
        for img_idx in member_ids:
            logits_json[key]['imgs_tags'][img_idx][key + '_{}.jpg'.format(img_idx)] = optional_tags[label_id[c]]
    if not_matched:
        print(key, '\n', match_score, '\n', match_score_norm)
        pprint(logits_json[key])

  5%|▌         | 280/5331 [00:02<00:50, 100.60it/s]

624952669722 
 [[3.58562636 1.        ]
 [5.74702406 1.        ]] 
 [[0.58029295 0.79370053]
 [0.93009064 0.79370053]]
{'imgs_tags': [{'624952669722_0.jpg': '珊瑚色组合'},
               {'624952669722_1.jpg': '珊瑚色组合'},
               {'624952669722_2.jpg': '橄榄色组合'},
               {'624952669722_3.jpg': '橄榄色组合'},
               {'624952669722_4.jpg': '橄榄色组合'}],
 'optional_tags': ['珊瑚色组合', '橄榄色组合']}


  7%|▋         | 349/5331 [00:03<00:50, 97.89it/s] 

614043736149 
 [[ 5.96085691  5.81517696  2.98464155  1.          6.27514553]
 [ 3.44056153  3.20435333 11.78937626  1.          3.06254959]
 [ 3.48982143 11.48388481  4.44730949  1.          3.26497936]
 [12.58054924  2.30453396  2.99636054  1.          2.12128043]
 [ 3.87575579  2.04849195  1.98173594  1.         13.41447067]] 
 [[0.44871058 0.48119926 0.24591403 0.58480355 0.44876604]
 [0.25899235 0.26515658 0.97136387 0.58480355 0.21901775]
 [0.26270045 0.95027836 0.36642785 0.58480355 0.23349449]
 [0.94701577 0.19069755 0.24687959 0.58480355 0.15170304]
 [0.29175211 0.16951037 0.16328147 0.58480355 0.95933376]]
{'imgs_tags': [{'614043736149_0.jpg': '条纹'},
               {'614043736149_1.jpg': '桔色'},
               {'614043736149_2.jpg': '红色'},
               {'614043736149_3.jpg': '桔色'},
               {'614043736149_4.jpg': '红色'},
               {'614043736149_5.jpg': '黑色'},
               {'614043736149_6.jpg': '条纹'},
               {'614043736149_7.jpg': '黑色'},
               {

 15%|█▌        | 814/5331 [00:07<00:49, 92.17it/s] 

565607486988 
 [[13.45341301  1.        ]
 [12.23390961  1.        ]] 
 [[0.82951597 0.79370053]
 [0.75432334 0.79370053]]
{'imgs_tags': [{'565607486988_0.jpg': '条纹'},
               {'565607486988_1.jpg': '条纹'},
               {'565607486988_2.jpg': '条纹'},
               {'565607486988_3.jpg': '黑色'},
               {'565607486988_4.jpg': '黑色'},
               {'565607486988_5.jpg': '黑色'},
               {'565607486988_6.jpg': '黑色'}],
 'optional_tags': ['黑色', '条纹']}


 30%|██▉       | 1596/5331 [00:15<00:31, 117.23it/s]

623845128856 
 [[ 5.17911816  1.         15.68574524]
 [ 9.61451912  1.          3.86540008]
 [ 7.51175499  1.         14.71330643]] 
 [[0.45741765 0.69336127 0.81602984]
 [0.8491505  0.69336127 0.20109225]
 [0.66343521 0.69336127 0.76544002]]
{'imgs_tags': [{'623845128856_0.jpg': '黑色'},
               {'623845128856_1.jpg': '黑色'},
               {'623845128856_2.jpg': '条纹'},
               {'623845128856_3.jpg': '白色'},
               {'623845128856_4.jpg': '条纹'},
               {'623845128856_5.jpg': '白色'}],
 'optional_tags': ['黑色', '条纹', '白色']}
605311018824 
 [[11.22296333  1.          6.2164464 ]
 [10.44147778  1.          4.11536407]
 [ 2.54653502  1.          9.91519547]] 
 [[0.81949998 0.69336127 0.57184024]
 [0.76243596 0.69336127 0.37856528]
 [0.18594781 0.69336127 0.91208182]]
{'imgs_tags': [{'605311018824_0.jpg': '黑色'},
               {'605311018824_1.jpg': '蓝色'},
               {'605311018824_2.jpg': '黑色'},
               {'605311018824_3.jpg': '条纹'},
               {'605311

 31%|███▏      | 1673/5331 [00:15<00:29, 124.17it/s]

621875084587 
 [[13.52489948  1.        ]
 [11.87785721  1.        ]] 
 [[0.84163845 0.79370053]
 [0.73914496 0.79370053]]
{'imgs_tags': [{'621875084587_0.jpg': '条纹'},
               {'621875084587_1.jpg': '条纹'},
               {'621875084587_2.jpg': '条纹'},
               {'621875084587_3.jpg': '条纹'},
               {'621875084587_4.jpg': '黑色'},
               {'621875084587_5.jpg': '黑色'},
               {'621875084587_6.jpg': '黑色'}],
 'optional_tags': ['黑色', '条纹']}


 43%|████▎     | 2292/5331 [00:21<00:33, 91.59it/s] 

576888702197 
 [[ 7.64519119  1.          5.14920902 12.56099319]
 [ 4.96863699  1.         13.72055721  8.55056953]
 [12.60352802  1.          6.035532    8.46185589]
 [ 5.63134909  1.          7.43784094 15.27624798]] 
 [[0.54567742 0.62996052 0.34410482 0.66373374]
 [0.3546377  0.62996052 0.91690003 0.45181949]
 [0.89957995 0.62996052 0.40333489 0.44713178]
 [0.40193894 0.62996052 0.49704662 0.80721015]]
{'imgs_tags': [{'576888702197_0.jpg': '黑色'},
               {'576888702197_1.jpg': '黑色'},
               {'576888702197_2.jpg': '条纹色'},
               {'576888702197_3.jpg': '白色'},
               {'576888702197_4.jpg': '灰色'},
               {'576888702197_5.jpg': '灰色'},
               {'576888702197_6.jpg': '条纹色'},
               {'576888702197_7.jpg': '黑色'},
               {'576888702197_8.jpg': '白色'}],
 'optional_tags': ['黑色', '条纹色', '灰色', '白色']}


 47%|████▋     | 2479/5331 [00:23<00:28, 101.06it/s]

603281751757 
 [[12.63451099  1.        ]
 [13.82193756  1.        ]] 
 [[0.75655645 0.79370053]
 [0.82765973 0.79370053]]
{'imgs_tags': [{'603281751757_0.jpg': '黑色'},
               {'603281751757_1.jpg': '黑色'},
               {'603281751757_2.jpg': '黑色'},
               {'603281751757_3.jpg': '拼色'},
               {'603281751757_4.jpg': '黑色'},
               {'603281751757_5.jpg': '拼色'},
               {'603281751757_6.jpg': '拼色'}],
 'optional_tags': ['黑色', '拼色']}
617278066189 
 [[6.88708878 1.        ]
 [6.63985157 1.        ]] 
 [[0.80793749 0.79370053]
 [0.77893362 0.79370053]]
{'imgs_tags': [{'617278066189_0.jpg': '条纹套装'},
               {'617278066189_1.jpg': '条纹套装'},
               {'617278066189_2.jpg': '条纹套装'},
               {'617278066189_3.jpg': '白色套装'},
               {'617278066189_4.jpg': '白色套装'},
               {'617278066189_5.jpg': '条纹套装'}],
 'optional_tags': ['白色套装', '条纹套装']}


 53%|█████▎    | 2820/5331 [00:26<00:24, 104.05it/s]

617521964984 
 [[ 3.97247601 11.68997765  1.          6.8406148 ]
 [13.0303421   4.28722477  1.          3.9030509 ]
 [ 4.40561724  8.7975359   1.          9.87052345]
 [ 8.59153461 11.75207329  1.          7.53992605]] 
 [[0.2755955  0.73763576 0.62996052 0.56552066]
 [0.90399632 0.27052321 0.62996052 0.32266923]
 [0.30564522 0.55512314 0.62996052 0.81600632]
 [0.59604849 0.74155398 0.62996052 0.62333344]]
{'imgs_tags': [{'617521964984_0.jpg': '条纹'},
               {'617521964984_1.jpg': '黑色'},
               {'617521964984_2.jpg': '黑色'},
               {'617521964984_3.jpg': '白色'},
               {'617521964984_4.jpg': '灰色'},
               {'617521964984_5.jpg': '白色'},
               {'617521964984_6.jpg': '条纹'},
               {'617521964984_7.jpg': '灰色'}],
 'optional_tags': ['黑色', '灰色', '条纹', '白色']}


 58%|█████▊    | 3089/5331 [00:29<00:21, 103.47it/s]

614491314779 
 [[14.27825642  1.        ]
 [14.17397881  1.        ]] 
 [[0.79659874 0.79370053]
 [0.79078099 0.79370053]]
{'imgs_tags': [{'614491314779_0.jpg': '黑白点'},
               {'614491314779_1.jpg': '竖条纹'},
               {'614491314779_2.jpg': '黑白点'},
               {'614491314779_3.jpg': '竖条纹'},
               {'614491314779_4.jpg': '竖条纹'},
               {'614491314779_5.jpg': '竖条纹'},
               {'614491314779_6.jpg': '黑白点'}],
 'optional_tags': ['黑白点', '竖条纹']}


 64%|██████▍   | 3438/5331 [00:33<00:18, 104.21it/s]

621931894053 
 [[ 1.         12.72706032  8.73714638]
 [ 1.          7.30259895 12.93735695]
 [ 1.          7.26740885  8.57313538]] 
 [[0.69336127 0.89926766 0.57752905]
 [0.69336127 0.51598648 0.85516473]
 [0.69336127 0.51350002 0.56668785]]
{'imgs_tags': [{'621931894053_0.jpg': '白色半裙'},
               {'621931894053_1.jpg': '白色半裙'},
               {'621931894053_2.jpg': '白色半裙'},
               {'621931894053_3.jpg': '黑色半裙'},
               {'621931894053_4.jpg': '条纹衬衫'},
               {'621931894053_5.jpg': '白色半裙'},
               {'621931894053_6.jpg': '黑色半裙'}],
 'optional_tags': ['条纹衬衫', '黑色半裙', '白色半裙']}


 70%|██████▉   | 3719/5331 [00:35<00:13, 120.47it/s]

601450019157 
 [[1.]] 
 [[1.]]
{'imgs_tags': [{'601450019157_0.jpg': '1906纯色'},
               {'601450019157_1.jpg': '1906纯色'},
               {'601450019157_2.jpg': '1906纯色'}],
 'optional_tags': ['1906纯色']}


 82%|████████▏ | 4374/5331 [00:41<00:07, 126.92it/s]

625820335127 
 [[11.50905609  1.          8.86293125]
 [ 6.76889086  1.         13.53934002]
 [13.28006554  1.          7.0369153 ]] 
 [[0.71465416 0.69336127 0.58227125]
 [0.42031388 0.69336127 0.889499  ]
 [0.82462489 0.69336127 0.46230681]]
{'imgs_tags': [{'625820335127_0.jpg': '白色'},
               {'625820335127_1.jpg': '白色'},
               {'625820335127_2.jpg': '条纹'},
               {'625820335127_3.jpg': '黑色'},
               {'625820335127_4.jpg': '条纹'},
               {'625820335127_5.jpg': '白色'},
               {'625820335127_6.jpg': '条纹'},
               {'625820335127_7.jpg': '黑色'}],
 'optional_tags': ['黑色', '条纹', '白色']}


 85%|████████▍ | 4526/5331 [00:42<00:06, 116.79it/s]

577541680180 
 [[15.40407181  1.          7.24776173  1.84761405]
 [ 1.82517242  1.          2.26344609 11.71285343]
 [ 3.90824056  1.         12.36336899  3.2937541 ]
 [ 2.89369607  1.          4.93059397 10.03467846]] 
 [[0.99192408 0.62996052 0.54118827 0.13335765]
 [0.11752948 0.62996052 0.16901087 0.84541389]
 [0.25166579 0.62996052 0.92316919 0.23773758]
 [0.18633559 0.62996052 0.36816603 0.72428607]]
{'imgs_tags': [{'577541680180_0.jpg': '红色'},
               {'577541680180_1.jpg': '红色'},
               {'577541680180_2.jpg': '蓝色'},
               {'577541680180_3.jpg': '蓝色'},
               {'577541680180_4.jpg': '橙色'},
               {'577541680180_5.jpg': '深色'},
               {'577541680180_6.jpg': '橙色'},
               {'577541680180_7.jpg': '蓝色'},
               {'577541680180_8.jpg': '蓝色'}],
 'optional_tags': ['橙色', '深色', '红色', '蓝色']}


 90%|████████▉ | 4786/5331 [00:44<00:04, 120.51it/s]

612126042525 
 [[ 1.         13.88354111]
 [ 1.         12.36282253]] 
 [[0.79370053 0.83688756]
 [0.79370053 0.74521999]]
{'imgs_tags': [{'612126042525_0.jpg': '957黑色'},
               {'612126042525_1.jpg': '957黑色'},
               {'612126042525_2.jpg': '957黑色'},
               {'612126042525_3.jpg': '6802格子'},
               {'612126042525_4.jpg': '957黑色'},
               {'612126042525_5.jpg': '957黑色'},
               {'612126042525_6.jpg': '957黑色'}],
 'optional_tags': ['6802格子', '957黑色']}


100%|██████████| 5331/5331 [00:49<00:00, 108.33it/s]


In [15]:
# Persist the labelled result. ensure_ascii=False writes the tag text as raw non-ASCII
# characters, so the file encoding is pinned to UTF-8 instead of the platform default;
# this matches the encoding used when result_logits.json is read.
with open('./result_labels.json', 'w', encoding='utf-8') as f:
    json.dump(logits_json, f, indent=4, ensure_ascii=False)