In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import requests
from PIL import Image
import tokenization
import pickle



In [2]:
# Load the product table; the first CSV column is used as the row index.
data = pd.read_csv('data.csv', index_col=0)
data

Unnamed: 0,product_spu_name,product_picture,property
0,jason/捷森 低脂牛奶 240ml 德国进口,http://p1.meituan.net/sgopen/18c6fae4044b41910...,瓶装
1,【24盒】蒙牛特仑苏有机纯牛奶利乐梦幻盖250ml×24包,http://p0.meituan.net/xianfu/a016be57a13051a20...,箱装
2,菊乐250ml纯牛奶,http://p0.meituan.net/xianfu/12c40a4415a2619fe...,盒装
3,伊利 利乐包纯牛奶250mL*16盒 /箱,http://p0.meituan.net/wmproduct/815e983175af66...,箱装
4,【整箱】伊利 营养纯牛奶小纯奶 250ml*24盒/箱,https://p0.meituan.net/xianfu/e4724d6cfcc2f23f...,箱装
...,...,...,...
18148,托本莫瑞 12年单一麦芽威士忌 700ml/瓶,http://p1.meituan.net/xianfu/21385b5b7ff21cb42...,瓶装
18149,日本 响威士忌【正品行货 防伪码】洋酒烈酒 700ml/瓶,http://p0.meituan.net/scproduct/cf68bcced2a3b5...,瓶装
18150,格兰菲迪 麦芽威士忌40度 700ml盒装,http://p0.meituan.net/waimaidpoipicmining/6447...,盒装
18151,道格拉斯 美食家麦芽威士忌洋酒 700ml/瓶,http://p1.meituan.net/wmproduct/13c24535b35c99...,瓶装


In [3]:
# Build a two-part input per product: the product name, a tab separator, then a
# fixed prompt followed by the value of the `property` column.
data['text'] = data['product_spu_name'] + '\t' + '"包装方式"属性为' + data['property']

In [4]:
# Frequency of each distinct value in the `property` column.
data['property'].value_counts()

瓶装     11957
盒装      1715
箱装      1686
罐装      1447
袋装       530
连包装      259
礼盒装      232
桶装       219
杯装        92
坛装         6
散装         3
Name: property, dtype: int64

In [5]:
def split_and_tokenize(all_documents, tokenizer, data_pd, prod_num):
    """Tokenize one row of ``data_pd['text']`` into the last open document.

    The text at position ``prod_num`` is cleaned of ``<eop>`` markers, passed
    through ``tokenization.convert_to_unicode``, stripped of surrounding
    newlines and split on tab characters. Each resulting piece is tokenized
    with ``tokenizer.tokenize`` and its token list is appended to
    ``all_documents[-1]``. A new empty document is appended afterwards, so the
    next call starts with a fresh one.

    Args:
        all_documents: list of documents; its last element receives the tokens.
        tokenizer: object exposing ``tokenize(text)``.
        data_pd: DataFrame with a ``text`` column.
        prod_num: positional row index into ``data_pd``.
    """
    # Fetch the tab-separated text and drop any "<eop>" markers.
    cleaned = data_pd['text'].iloc[prod_num].replace("<eop>", "")
    # Normalize to unicode, then split into sentences on the tab character.
    sentences = tokenization.convert_to_unicode(cleaned).strip('\n').split('\t')
    # Tokenize every sentence and store it in the currently open document.
    for sentence in sentences:
        sentence_tokens = tokenizer.tokenize(sentence)
        all_documents[-1].append(sentence_tokens)
    # Open a new, empty document for the next product.
    all_documents.append([])

In [7]:
# Start with a single empty document; split_and_tokenize appends to the last
# entry and then opens a new empty one.
all_documents = [[]]
# Tokenizer built from vocab.txt with lower-casing enabled.
tokenizer = tokenization.FullTokenizer(
        vocab_file='vocab.txt', do_lower_case=True)

In [8]:
# Tokenize every row of `data`, walking the rows in positional order.
for i in range(len(data)):
    split_and_tokenize(all_documents, tokenizer, data, prod_num=i)

In [9]:
# Keep only non-empty documents (removes the trailing empty placeholder).
all_documents = list(filter(None, all_documents))

In [130]:
def create_masked_lm_predictions(tokens):
    """Mask the leading character of the packaging value that ends ``tokens``.

    The token at index -4 is compared with the heads of the three-character
    values ('礼', '连'); otherwise the token at index -3 is compared with the
    heads of the two-character values. The matched token becomes the label,
    and everything from that position onward is replaced by
    ``"[MASK]", "[SEP]"``.

    Args:
        tokens: list of token strings for one instance.

    Returns:
        A tuple ``(output_tokens, masked_lm_positions, masked_lm_labels)``
        where the last two are single-element lists, or ``None`` (after
        printing ``tokens``) when no known label is found or the sequence is
        too short to contain one.
    """
    two_char_heads = ['瓶', '箱', '盒', '袋', '罐', '杯', '桶', '散', '坛']
    three_char_heads = ['礼', '连']
    num_tokens = len(tokens)

    # The three-character values must be tested first: for 礼盒装 the token at
    # -3 is 盒, which would otherwise be mistaken for the two-character 盒装,
    # leaving 礼 unmasked in the input and producing the wrong label.
    if num_tokens >= 4 and tokens[-4] in three_char_heads:
        mask_index = num_tokens - 4
    elif num_tokens >= 3 and tokens[-3] in two_char_heads:
        mask_index = num_tokens - 3
    else:
        # No recognizable label: show the offending sequence and signal failure.
        print(tokens)
        return

    # Ground-truth label and the position that is masked.
    masked_lm_labels = [tokens[mask_index]]
    masked_lm_positions = [mask_index]
    # Sentence prefix + [MASK] + [SEP].
    output_tokens = tokens[:mask_index] + ["[MASK]"] + ["[SEP]"]
    return output_tokens, masked_lm_positions, masked_lm_labels

def create_instances_from_document(all_documents, num, tokenizer):
    """Build the model-input arrays for the document at index ``num``.

    The token sequence is laid out as
    ``[CLS] [IMAGE] sentence_1 [SEP] sentence_2 [SEP]``, the label token is
    masked by ``create_masked_lm_predictions`` and the result is converted to
    arrays by ``all_data_to_np``.

    Args:
        all_documents: list of documents, each a list of token lists.
        num: index of the document to convert.
        tokenizer: tokenizer forwarded to ``all_data_to_np``.

    Returns:
        The dict produced by ``all_data_to_np``.

    Raises:
        ValueError: if ``create_masked_lm_predictions`` finds no label to mask.
    """
    document = all_documents[num]
    tokens = ["[CLS]", "[IMAGE]"]
    token_type_ids = [0, 0]
    for sentence_index, sentence_tokens in enumerate(document):
        tokens.extend(sentence_tokens)
        tokens.append("[SEP]")
        if sentence_index == 0:
            # Segment 0 covers the first sentence AND the [SEP] that closes it;
            # leaving the [SEP] out would let it be filled with segment 1 below.
            token_type_ids.extend([0] * (len(sentence_tokens) + 1))

    masked = create_masked_lm_predictions(tokens)
    if masked is None:
        # Fail with a clear message instead of an opaque unpacking TypeError.
        raise ValueError(
            "document %r: no known packaging label found to mask" % (num,))
    tokens, masked_lm_positions, masked_lm_labels = masked

    # Every remaining position belongs to the second segment.
    token_type_ids.extend([1] * (len(tokens) - len(token_type_ids)))
    return all_data_to_np(tokens, masked_lm_positions, masked_lm_labels, token_type_ids, tokenizer)

def all_data_to_np(tokens, masked_lm_positions, masked_lm_labels, token_type_ids, tokenizer, max_seq_len=512):
    """Convert one tokenized instance into int32 NumPy arrays.

    Token ids and segment ids are right-padded with zeros until the token ids
    reach ``max_seq_len``. Sequences that are already longer are returned
    without truncation. The caller's lists are not modified.

    Args:
        tokens: list of token strings (already masked).
        masked_lm_positions: list of masked positions.
        masked_lm_labels: list of the original tokens at those positions.
        token_type_ids: list of segment ids, one per token.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_len: target length for padding.

    Returns:
        dict with keys ``input_ids``, ``token_type_ids``,
        ``masked_lm_positions`` and ``masked_lm_ids``, each an ``np.int32``
        array.
    """
    # Work on copies so padding never leaks back into the caller's lists.
    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    segment_ids = list(token_type_ids)

    # Pad both sequences with the same number of zeros.
    pad_len = max_seq_len - len(input_ids)
    if pad_len > 0:
        input_ids.extend([0] * pad_len)
        segment_ids.extend([0] * pad_len)

    masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
    return {
        'input_ids': np.array(input_ids, dtype=np.int32),
        'token_type_ids': np.array(segment_ids, dtype=np.int32),
        'masked_lm_positions': np.array(masked_lm_positions, dtype=np.int32),
        'masked_lm_ids': np.array(masked_lm_ids, dtype=np.int32),
    }

In [133]:
# Build the instance for a single document (index 2).
dict_ = create_instances_from_document(all_documents, 2, tokenizer)

In [137]:
# Map a zero-padded, five-digit document index to that document's arrays.
all_data = {}
for i in range(len(all_documents)):
    all_data[f"{i:05d}"] = create_instances_from_document(all_documents, i, tokenizer)

In [147]:
# Persist the text features; the context manager closes the file even if
# pickling raises.
with open('text_data.pkl', 'wb') as text_files:
    pickle.dump(all_data, text_files)

In [141]:
# Load the image embeddings; the context manager closes the file even if
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source.
with open('image_embedding.pkl', 'rb') as image_embedding_file:
    image_embedding = pickle.load(image_embedding_file)

In [145]:
# Number of keys in the loaded image-embedding object.
len(image_embedding.keys())

18146

In [146]:
# Number of text instances that were built.
len(all_data)

18146

In [156]:
# Container for the combined image + text features, filled in a later cell.
two_modal_data = {}

In [157]:
# Combine the image embedding and the text arrays of every sample under its
# integer index; both sources are looked up by the zero-padded string key.
for i in range(len(image_embedding.keys())):
    key = f"{i:05d}"
    sample = {'image_embedding': image_embedding[key]}
    text_features = all_data[key]
    for field in ('input_ids', 'token_type_ids', 'masked_lm_positions', 'masked_lm_ids'):
        sample[field] = text_features[field]
    two_modal_data[i] = sample

In [160]:
# Persist the combined features; the context manager closes the file even if
# pickling raises.
with open('image_text_file.pkl', 'wb') as two_modal_data_file:
    pickle.dump(two_modal_data, two_modal_data_file)