This notebook searches for the accessories that we listed in the dataset and generates, for every car, a vector marking whether the car has each of those accessories.

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import pandas as pd
from string import digits
from multiprocessing import Pool
import os, time

In [None]:
def load_csv(ori_file="../data/train.csv"):
    """Load the listing CSV and return its ids and accessory strings.

    Args:
        ori_file: Path of the CSV to read. It must contain the columns
            ``listing_id`` and ``accessories``. Defaults to the training
            file, so existing no-argument calls behave as before; pass
            another path (e.g. the test split) instead of editing this
            function.

    Returns:
        A pair ``(listing_ids, accessories)`` of plain Python lists, aligned
        row by row. Missing accessories are returned as pandas read them
        (NaN), without any cleaning.
    """
    df_ = pd.read_csv(ori_file)
    target_df = df_[['listing_id', 'accessories']]
    return target_df['listing_id'].values.tolist(), target_df['accessories'].values.tolist()

In [None]:
# check whether every word of the first string occurs inside the second string
def judge_substring(sub_str, tar_str):
    """Return True when each space-separated word of ``sub_str`` is in ``tar_str``.

    The phrase is split on single spaces and every resulting piece is tested
    with the ``in`` operator against ``tar_str``; the words may appear in any
    order and anywhere in the target. Empty pieces (from an empty phrase or
    consecutive spaces) are contained in any string, so they never cause a
    mismatch.
    """
    return all(word in tar_str for word in sub_str.split(' '))

In [None]:
#given a text, use bert to generate the embedded vector
def bert_embed(text, model, tokenizer):
    """Embed ``text`` as the token-wise mean of a hidden layer of ``model``.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, tokenized, and run
    through ``model`` with gradient tracking disabled. The result is the mean
    over dim 0 of ``outputs[2][-2][0]``, i.e. of the second-to-last entry of
    the hidden states the model returns at index 2.

    Args:
        text: String to embed.
        model: Callable model whose output exposes the hidden states at
            index 2 (this notebook loads ``BertModel`` with
            ``output_hidden_states=True``).
        tokenizer: Object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        The averaged embedding tensor, located on the same device as the
        model.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Follow the model's own device instead of assuming a GPU, so the function
    # also works when the model was left on the CPU.
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device('cpu')

    tokens_tensor = torch.tensor([indexed_tokens], device=device)
    # The all-ones tensor was previously passed as the second positional
    # argument, which in transformers' BertModel is the attention mask (not
    # segment ids). Passing it by keyword keeps the same behaviour and makes
    # the intent explicit: every token is attended to.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        hidden_states = outputs[2]
        token_vecs = hidden_states[-2][0]
        return torch.mean(token_vecs, dim=0)

In [None]:
# judge if the sentence contains certain items using cosine similarity and substring_judge
def compare_items(text, item_list, vector_, threshold=0.5):
    """Return the ids of the items that match ``text``.

    An item matches when the cosine similarity between ``vector_`` and the
    item's embedding exceeds ``threshold``, or when ``judge_substring`` finds
    every word of the item name inside ``text``.

    Args:
        text: The (already normalised) accessory fragment.
        item_list: Iterable of ``(item_id, item_name, item_embedding)``
            entries.
        vector_: Embedding tensor of ``text``.
        threshold: Minimum cosine similarity (exclusive) for a match.
            NOTE(review): the previous ``int(sim) > threshold`` truncated the
            similarity to 0 for every value below 1.0, so this branch almost
            never fired. Now that the real value is compared, 0.5 should be
            re-validated against the data.

    Returns:
        A list of the distinct matching item ids (order not significant).
    """
    matched_ids = set()
    query = vector_.reshape(1, -1)
    for tp in item_list:
        # Compare as a float; int() would floor any similarity in [0, 1) to 0.
        sim = torch.cosine_similarity(query, tp[2].reshape(1, -1)).item()
        if sim > threshold or judge_substring(tp[1], text):
            matched_ids.add(tp[0])
    return list(matched_ids)

In [None]:
#call bert embedding function to generate the vector
def generate_vector(item_list, source_df, model, tokenizer):
    """Match every listing's accessory text against the known items.

    Each ``accessories`` value is split into fragments on ``.`` and ``,``.
    Every fragment is stripped, has ``/`` replaced by a space, has digits
    removed and is lower-cased; it is then embedded with ``bert_embed`` and
    matched with ``compare_items``.

    Args:
        item_list: Entries as produced by ``item_list_generate``.
        source_df: DataFrame with ``listing_id`` and ``accessories`` columns.
        model: Model forwarded to ``bert_embed``.
        tokenizer: Tokenizer forwarded to ``bert_embed``.

    Returns:
        A list of ``(listing_id, matched_item_ids)`` tuples, one per row and
        in row order. Ids matched by several fragments of the same listing
        appear once per fragment.
    """
    result = []
    remove_digits = str.maketrans('', '', digits)
    rows = zip(source_df['listing_id'], source_df['accessories'])
    # Passing total lets tqdm show real progress instead of a bare counter.
    for id_, raw_text in tqdm(rows, total=len(source_df)):
        temp_res = []
        # A missing value would otherwise be embedded as the literal "nan".
        if not pd.isna(raw_text):
            for fragment in str(raw_text).replace('.', ',').split(','):
                text = fragment.strip().replace('/', ' ').translate(remove_digits).lower()
                # Blank fragments (trailing commas, digit-only pieces) carry
                # no information; skip the model call for them.
                if not text.strip():
                    continue
                sentence_embedding = bert_embed(text, model, tokenizer)
                temp_res.extend(compare_items(text, item_list, sentence_embedding))
        result.append((id_, temp_res))
    return result

In [None]:
#load the item list we have
def item_list_generate(model, tokenizer, item_file="../data/items.csv"):
    """Load the item catalogue and embed every item name.

    The CSV must have the columns ``Id`` and ``Items``; ``Items`` holds one
    or more comma-separated names that all map to the same id.

    Args:
        model: Model forwarded to ``bert_embed``.
        tokenizer: Tokenizer forwarded to ``bert_embed``.
        item_file: Path of the item CSV. Defaults to the original hard-coded
            location.

    Returns:
        A list of ``(item_id, item_name, embedding)`` tuples, with names
        stripped and lower-cased.
    """
    item_df = pd.read_csv(item_file)
    id_list = item_df['Id'].values.tolist()
    item_list = item_df['Items'].values.tolist()
    res_list = []
    for idx, raw_items in zip(id_list, item_list):
        # An empty Items cell is read as NaN and has nothing to split.
        if pd.isna(raw_items):
            continue
        for item in str(raw_items).split(','):
            item = item.strip().lower()
            # An empty name (e.g. from a trailing comma) must be dropped:
            # judge_substring treats '' as contained in every text, so it
            # would mark this id as present for every listing.
            if not item:
                continue
            embd = bert_embed(item, model, tokenizer)
            res_list.append((idx, item, embd))
    return res_list


In [None]:
# driver cell: runs the helpers above end to end and writes the result csv

# 1) read the listing ids and raw accessory strings, then pair them in a frame
id__, acc__ = load_csv()
print(len(id__))
acc_df = pd.DataFrame({
    'listing_id': id__,
    'accessories': acc__,
})

# 2) load the pretrained tokenizer and model; hidden states of every layer are
#    requested because bert_embed reads them, and the model is moved to the GPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
).to('cuda')

# 3) embed the item catalogue once, then match every listing against it
item_list = item_list_generate(model, tokenizer)
print(acc_df.shape)
total_dict = generate_vector(item_list, acc_df, model, tokenizer)

# 4) split the (listing_id, matched_ids) pairs into two columns and save them
ids_ = [listing_id for listing_id, _ in total_dict]
vectors = [matched for _, matched in total_dict]
data_ = {'listing_id': ids_, 'accessories_vectors': vectors}
res_df = pd.DataFrame(data_)
res_df.to_csv("../data/embed_accessories.csv")
