In [137]:
import pandas as pd
import numpy as np
import base64
from tqdm import tqdm
import warnings
from sklearn.externals import joblib
import torch
from torch.autograd import Variable
import itertools
import time
import argparse
import re
warnings.filterwarnings('ignore')

## train samples

In [2]:
train_sample = pd.read_csv('/home/poac/AnomalyDetectionDataset/kdd_cup_2020/train.sample.tsv', sep='\t')
train_sample.head()

Unnamed: 0,product_id,image_h,image_w,num_boxes,boxes,features,class_labels,query,query_id
0,102851856,338,209,2,AAAAAAAAQEAAAKlDAABPQwAAb0MAALBBAICmQwAATEM=,AAAAAAAAAAAnNsFAAAAAAOeVGD0AAAAAAAAAAAAAAAAAAA...,HAAAAAAAAAAcAAAAAAAAAA==,treble popular reed,923271
1,101783080,80,61,1,AACgQAAAoEAAAJZCAABgQg==,AAAAADnKfjyatb8/AAAAALxPBz0MH6E9xwkoPAAAAAAAAA...,HAAAAAAAAAA=,compatible ink cartridges,234031
2,100492797,750,750,1,AAAsQgCAg0MAgCREAADqQw==,AAAAAAAAAABmUlE/AAAAABvc0jwAAAAAAAAAAAAAAAAAAA...,BwAAAAAAAAA=,check new look facial cream,199926
3,101708088,480,640,5,AAB6QwCA0kMAANBDAMAeRAAAfEMAAKBAAIDWQwAAVkMAAH...,AAAAAAAAAAAAAAAAAAAAAAXyfz4AAAAAAAAAAAAAAAAAAA...,HAAAAAAAAAAcAAAAAAAAABwAAAAAAAAAHAAAAAAAAAAcAA...,royal commemorative stamp,772725
4,100316842,800,800,4,AAAKQwCAgEMAADNEAAAHRAAAIkMAwCZEAACqQwDARkQAAE...,AAAAADSdaTwVTdA8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,HAAAAAAAAAAGAAAAAAAAAAYAAAAAAAAAHAAAAAAAAAA=,calendula baby shower gel,362576


In [3]:
# 处理boxes
boxes = np.array(train_sample['boxes'])
for i in tqdm(range(boxes.shape[0])):
    tem = boxes[i]
    train_sample['boxes'][i] = np.frombuffer(base64.b64decode(tem), dtype=np.float32).reshape(train_sample['num_boxes'][i], 4)

100%|██████████| 10000/10000 [03:17<00:00, 50.59it/s]


In [5]:
# 处理features
features = np.array(train_sample['features'])
for i in tqdm(range(boxes.shape[0])):
    train_sample['features'][i] = np.frombuffer(base64.b64decode(features[i]), dtype=np.float32).reshape(train_sample['num_boxes'][i], 2048)

100%|██████████| 10000/10000 [03:18<00:00, 50.35it/s]


In [8]:
# 处理class_labels
class_labels = np.array(train_sample['class_labels'])
for i in tqdm(range(class_labels.shape[0])):
    train_sample['class_labels'][i] = np.frombuffer(base64.b64decode(class_labels[i]), dtype=np.int64).reshape(train_sample['num_boxes'][i])

100%|██████████| 10000/10000 [03:30<00:00, 47.42it/s]


In [10]:
train_sample.head()

Unnamed: 0,product_id,image_h,image_w,num_boxes,boxes,features,class_labels,query,query_id
0,102851856,338,209,2,"[[0.0, 3.0, 338.0, 207.0], [239.0, 22.0, 333.0...","[[0.0, 0.0, 6.0378604, 0.0, 0.037252333, 0.0, ...","[28, 28]",treble popular reed,923271
1,101783080,80,61,1,"[[5.0, 5.0, 75.0, 56.0]]","[[0.0, 0.015551143, 1.4977295, 0.0, 0.03303502...",[28],compatible ink cartridges,234031
2,100492797,750,750,1,"[[43.0, 263.0, 658.0, 468.0]]","[[0.0, 0.0, 0.81766355, 0.0, 0.02573972, 0.0, ...",[7],check new look facial cream,199926
3,101708088,480,640,5,"[[250.0, 421.0, 416.0, 635.0], [252.0, 5.0, 42...","[[0.0, 0.0, 0.0, 0.0, 0.24994667, 0.0, 0.0, 0....","[28, 28, 28, 28, 28]",royal commemorative stamp,772725
4,100316842,800,800,4,"[[138.0, 257.0, 716.0, 540.0], [162.0, 667.0, ...","[[0.0, 0.014258672, 0.02542738, 0.0, 0.0, 0.0,...","[28, 6, 6, 28]",calendula baby shower gel,362576


In [None]:
# save
joblib.dump(train_sample, '/home/poac/AnomalyDetectionDataset/kdd_cup_2020/processed/train_sample_processed.jl.z')

## test A

In [2]:
testA = pd.read_csv('/home/poac/AnomalyDetectionDataset/kdd_cup_2020/testA.tsv', sep='\t')
testA.head()

Unnamed: 0,product_id,image_h,image_w,num_boxes,boxes,features,class_labels,query,query_id
0,103052445,798,798,1,AIC3QwAAF0MAABxEAMAtRA==,AAAAAAAAAABwKQE9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,BAAAAAAAAAA=,straw heavy-bottomed sandals,0
1,103044513,800,800,1,AABqQwAAzEIAADJEAMAgRA==,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,BAAAAAAAAAA=,straw heavy-bottomed sandals,0
2,103035030,800,800,8,AAAAQwAAAEEAgNRDAICGQwAAUEEAAPBBAACUQgAA3EIAAI...,AAAAAAAAAABFkBg+AAAAAM5daTwAAAAAAAAAAAAAAAAAAA...,AAAAAAAAAAAFAAAAAAAAAAIAAAAAAAAAAwAAAAAAAAAEAA...,straw heavy-bottomed sandals,0
3,103036197,800,800,3,AAAmQwAAwEEAwC1EAICqQwAAJ0MAAFJDAAA1RADAAkQAAE...,AAAAAAAAAAD2rhE+AAAAAAAAAADbLAw+AAAAANGzLD4AAA...,BAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAA,straw heavy-bottomed sandals,0
4,103055323,800,800,1,AABUQwAA+EEAgChEAEBCRA==,AAAAAKzRGTs0osc+kJpVPAAAAAAAAAAAAAAAAAAAAAAAAA...,BAAAAAAAAAA=,straw heavy-bottomed sandals,0


In [3]:
# 处理boxes
boxes = np.array(testA['boxes'])
for i in tqdm(range(boxes.shape[0])):
    tem = boxes[i]
    testA['boxes'][i] = np.frombuffer(base64.b64decode(tem), dtype=np.float32).reshape(testA['num_boxes'][i], 4)

100%|██████████| 28830/28830 [13:16<00:00, 36.20it/s]


In [4]:
# 处理features
features = np.array(testA['features'])
for i in tqdm(range(boxes.shape[0])):
    testA['features'][i] = np.frombuffer(base64.b64decode(features[i]), dtype=np.float32).reshape(testA['num_boxes'][i], 2048)

100%|██████████| 28830/28830 [13:36<00:00, 35.31it/s]


In [5]:
# 处理class_labels
class_labels = np.array(testA['class_labels'])
for i in tqdm(range(class_labels.shape[0])):
    testA['class_labels'][i] = np.frombuffer(base64.b64decode(class_labels[i]), dtype=np.int64).reshape(testA['num_boxes'][i])

100%|██████████| 28830/28830 [12:39<00:00, 39.34it/s]


In [6]:
testA.head()

Unnamed: 0,product_id,image_h,image_w,num_boxes,boxes,features,class_labels,query,query_id
0,103052445,798,798,1,"[[367.0, 151.0, 624.0, 695.0]]","[[0.0, 0.0, 0.03153366, 0.0, 0.0, 0.0, 0.0, 0....",[4],straw heavy-bottomed sandals,0
1,103044513,800,800,1,"[[234.0, 102.0, 712.0, 643.0]]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[4],straw heavy-bottomed sandals,0
2,103035030,800,800,8,"[[128.0, 8.0, 425.0, 269.0], [13.0, 30.0, 74.0...","[[0.0, 0.0, 0.14898784, 0.0, 0.014243556, 0.0,...","[0, 5, 2, 3, 4, 4, 4, 5]",straw heavy-bottomed sandals,0
3,103036197,800,800,3,"[[166.0, 24.0, 695.0, 341.0], [167.0, 210.0, 7...","[[0.0, 0.0, 0.14226899, 0.0, 0.0, 0.13688986, ...","[4, 4, 4]",straw heavy-bottomed sandals,0
4,103055323,800,800,1,"[[212.0, 31.0, 674.0, 777.0]]","[[0.0, 0.0023470921, 0.3899094, 0.013037339, 0...",[4],straw heavy-bottomed sandals,0


In [7]:
# save
joblib.dump(testA, '/home/poac/AnomalyDetectionDataset/kdd_cup_2020/processed/testA_processed.jl.z')

['/home/poac/AnomalyDetectionDataset/kdd_cup_2020/processed/testA_processed.m']

## query

In [191]:
data = joblib.load('/home/poac/AnomalyDetectionDataset/kdd_cup_2020/processed/train_sample_processed.jl.z')

In [45]:
query = np.array(data['query'])
query.shape

(10000,)

In [46]:
def split_query(query):
    """
    split query
    :param query:
    :return: query and max len
    """
    out = []
    query_len = []
    for i in range(query.shape[0]):
        tem = str(query[i]).split(" ")
        query_len.append(len(tem))
        out.append(tem)
    out = np.array(out)
    query_len = np.array(query_len)
    return out, max(query_len)

In [47]:
query, max_len = split_query(query=query)

In [85]:
max_len

10

In [48]:
# word --> index
with open('/home/poac/code/Multi_modal_Retrieval/experiments/pretrained_models/bert-base-uncased-vocab.txt', mode="r", encoding="utf8") as f:
    lines = f.readlines()
    lines = [l.strip() for l in lines]
word_index = {v: k  for k, v in enumerate(lines)}

In [77]:
word_index['new']

2047

In [69]:
query

array([list(['treble', 'popular', 'reed']),
       list(['compatible', 'ink', 'cartridges']),
       list(['check', 'new', 'look', 'facial', 'cream']), ...,
       list(['silicone', 'accessories', 'storage', 'box']),
       list(['applicable', 'motor']), list(['restaurant', 'colander'])],
      dtype=object)

In [82]:
# 获取句子长度列表
len_list = []
for i in range(len(query)):
    len_list.append(len(query[i]))
len_list = np.array(len_list)
# 最长句子
max_len = max(len_list)
# 将每个句子中的单词变成索引值，并补齐长度
query_index = []
unsee = 0
for i in range(len(query)):
    tem = query[i]
    tem_ = []
    for j in range(len(tem)):
        if tem[j] not in word_index:
            tem_.append(0)
            unsee += 1
        else:
            tem_.append(word_index[tem[j]])
    # 补0
    if len_list[i] < max_len:
        for k in range(max_len-len_list[i]):
            tem_.append(0)
    tem_ = np.array(tem_).reshape(1, max_len)
    query_index.append(tem_)
query_index = np.array(query_index).reshape(-1, max_len)
print('unsee:', unsee)

unsee: 4737


In [278]:
query_index.shape

10

In [87]:
embedding = torch.nn.Embedding(len(word_index), 100, padding_idx=0)

In [89]:
query_index = Variable(torch.LongTensor(query_index))
query_emb = embedding(query_index)

In [92]:
query_emb[0, 0, :]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward>)

In [228]:
label = data['class_labels']

In [229]:
def get_label(path):
    with open(path) as f:
        lines = f.readlines()
        label2id = {l.split('\n')[0].split('\t')[1]:int(l.split('\n')[0].split('\t')[0]) for l in lines[1:]}
        id2label = {int(l.split('\n')[0].split('\t')[0]):l.split('\n')[0].split('\t')[1] for l in lines[1:]}
    return label2id, id2label

In [230]:
label2id, id2label = get_label(path='/home/poac/AnomalyDetectionDataset/kdd_cup_2020/multimodal_labels.txt')

In [231]:
id2label

{0: 'top clothes (coat, jacket, shirt, etc.)',
 1: 'skirt & dress',
 2: 'bottom clothes (trousers, pants, etc.)',
 3: 'luggage, leather goods',
 4: 'shoes',
 5: 'accessories (jewelry, clothing accessories, belts, hats, scarves, etc.)',
 6: 'snacks, nuts, liquor and tea',
 7: 'makeup, perfume, beauty tools and essential oils',
 8: 'bottle drink',
 9: 'furniture',
 10: 'stationery',
 11: 'household electrical appliances',
 12: 'home decoration',
 13: 'household fabric',
 14: 'kitchenware',
 15: 'home / personal cleaning tools',
 16: 'storage supplies',
 17: 'motorcycle, motorcycle accessories, vehicles, bicycle and riding equipment',
 18: 'outdoor product',
 19: 'lighting',
 20: 'toys',
 21: 'underwear',
 22: 'digital supplies',
 23: 'bed linens',
 24: 'baby products',
 25: 'personal care',
 26: 'sporting goods',
 27: 'clothes (accessories, baby clothing, etc.)',
 28: 'others',
 29: 'human face',
 30: 'arm',
 31: 'hair',
 32: 'hand'}

In [232]:
label[0][0]

28

In [233]:
label_word = []
for i in range(len(label)):
    tem = ''
    for j in range(len(label[i])):
        # 将其他符号替换为逗号
        tem = tem + ',' + id2label[label[i][j]]
        tem = re.sub(" ",",",tem)
        tem = re.sub('\(', ",", tem)
        tem = re.sub('\)', ",", tem)
        tem = re.sub('\.', ',', tem)
        tem = re.sub('&', ',', tem)
    # 按逗号分隔
    tem = tem.split(",")
    # 过滤空字符串
    tem = [x for x in tem if x != '']

    tem = list(tem)
    label_word.append(tem)

In [234]:
def process_query(query, is_split=True):
    """
    query --> query index
    :param query:
    :return: longtensor
    """
    # split
    if is_split:
        query, max_len = split_query(query=query)
    print('max len:', max_len)
    # word --> index
    with open('/home/poac/code/Multi_modal_Retrieval/experiments/pretrained_models/bert-base-uncased-vocab.txt',
              mode="r", encoding="utf8") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    word_index = {v: k for k, v in enumerate(lines)}
    # get len list
    len_list = []
    for i in range(len(query)):
        len_list.append(len(query[i]))
    len_list = np.array(len_list)
    # word --> index,  and padding
    query_index = []
    unsee = 0
    for i in range(len(query)):
        tem = query[i]
        tem_ = []
        for j in range(len(tem)):
            if tem[j] not in word_index:
                tem_.append(0)
                unsee += 1
            else:
                tem_.append(word_index[tem[j]])
        # padding 0
        if len_list[i] < max_len:
            for k in range(max_len - len_list[i]):
                tem_.append(0)
        tem_ = np.array(tem_).reshape(1, max_len)
        query_index.append(tem_)
    query_index = np.array(query_index).reshape(-1, max_len)
    print('unsee:', unsee)
    return Variable(torch.LongTensor(query_index))

In [235]:
def process_label(query):
    """
    label --> label index
    :param query:
    :return: longtensor
    """
    # word --> index
    with open('/home/poac/code/Multi_modal_Retrieval/experiments/pretrained_models/bert-base-uncased-vocab.txt',
              mode="r", encoding="utf8") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    word_index = {v: k for k, v in enumerate(lines)}
    # get len list
    len_list = []
    for i in range(len(query)):
        len_list.append(len(query[i]))
    len_list = np.array(len_list)
    max_len = max(len_list)
    print('max len:', max_len)
    # word --> index,  and padding
    query_index = []
    unsee = 0
    for i in range(len(query)):
        tem = query[i]
        tem_ = []
        for j in range(len(tem)):
            if tem[j] not in word_index:
                tem_.append(0)
                unsee += 1
            else:
                tem_.append(word_index[tem[j]])
        # padding 0
        if len_list[i] < max_len:
            for k in range(max_len - len_list[i]):
                tem_.append(0)
        tem_ = np.array(tem_).reshape(1, max_len)
        query_index.append(tem_)
    query_index = np.array(query_index).reshape(-1, max_len)
    print('unsee:', unsee)
    return Variable(torch.LongTensor(query_index))

In [236]:
label_index = process_label(query=label_word)

max len: 146
unsee: 1817


In [239]:
label_index.shape

torch.Size([10000, 146])

In [238]:
label_word[0]

['others', 'others']

In [279]:
label_index.shape

torch.Size([10000, 146])

In [204]:
word_index['others']

2500

In [240]:
features = data['features']
boxes = data['boxes']

In [247]:
features[0].shape

(2, 2048)

In [248]:
boxes[0].shape

(2, 4)

In [252]:
# get max box
max_box = max(data['num_boxes'])
print('max box:', max_box)
features = np.array(features)
boxes = np.array(boxes)
image_seq = []
for i in range(features.shape[0]):
    f_tem = features[i]
    b_tem = boxes[i]
    cat = np.concatenate([f_tem, b_tem], axis=1)
    # 补齐
    if data['num_boxes'][i] < max_box:
        zeros = np.zeros((max_box-data['num_boxes'][i], cat.shape[1]))
        cat = np.concatenate([cat, zeros], axis=0)
    image_seq.append(cat)
image_seq = np.array(image_seq)

max box: 51


In [254]:
image_seq[0, :, :]

array([[  0.        ,   0.        ,   6.03786039, ...,   3.        ,
        338.        , 207.        ],
       [  0.        ,   0.        ,   3.45090771, ...,  22.        ,
        333.        , 204.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ]])

In [276]:
def convert_pos(num_boxes, boxes, H, W):
    """
    convert box position to 5-dim feature
    :param num_boxes:
    :param boxes:
    :param H:
    :param W:
    :return:
    """
    out = []
    for i in range(num_boxes.shape[0]):
        pos_list = []
        for j in range(num_boxes[i]):
            temp = boxes[i][j, :]
            pos_list.append([temp[0]/W[i], temp[2]/W[i], temp[1]/H[i], temp[3]/H[i], 
                             ((temp[2] - temp[0]) * (temp[3] - temp[1]))/(W[i]*H[i]), ])
            
        pos_list = np.array(pos_list)
        out.append(pos_list)
    return np.array(out)

In [273]:
a = convert_pos(num_boxes=data['num_boxes'], boxes=data['boxes'], H=data['image_h'], W=data['image_w'])

In [275]:
a[0]

array([[0.        , 1.61722488, 0.00887574, 0.61242604, 0.97607656],
       [1.14354067, 1.59330144, 0.06508876, 0.6035503 , 0.24217887]])

In [288]:
n = 0
with open('/home/poac/code/Multi_modal_Retrieval/experiments/pretrained_models/bert-base-uncased-vocab.txt', mode="r", encoding="utf8") as f:
    lines = f.readlines()
len(lines)

30522

In [285]:
n

1