# Pet Supplies Text/Image Feature Extraction

In [1]:

import ast
import os

import numpy as np
import pandas as pd

In [2]:
# Switch the working directory to the dataset folder; the relative paths used
# by later cells ('./', 'text_feat.npy', 'image_feat.npy', ...) resolve against it.
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the notebook elsewhere.
os.chdir('/home/bjf/bjf_projects/MMRec/data/pet')
os.getcwd()

'/home/bjf/bjf_projects/MMRec/data/pet'

## Load text data

In [3]:
# Column names reused by later cells.
i_id, desc_str = 'itemID', 'description'

# Location of the item metadata CSV (relative to the working directory).
file_path = './'
file_name = 'meta-Pet_Supplies.csv'
meta_file = os.path.join(file_path, file_name)

# Read the CSV into a DataFrame and order the rows by item ID.
df = pd.read_csv(meta_file).sort_values(by=[i_id])

print('data loaded!')
print(f'shape: {df.shape}')

df[:3]

data loaded!
shape: (8510, 10)


Unnamed: 0,itemID,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,0,1223000893,"{'also_bought': ['B000G1KNPK', 'B00BYHJ6UI', '...","Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",29.5,{'Pet Supplies': 39514},http://ecx.images-amazon.com/images/I/41yr4bsw...,,"[['Pet Supplies', 'Cats', 'Toys']]",This combo pack provides you with all 3 Cat Si...
1,1,4847676011,"{'also_bought': ['B006YFZXGG', 'B002KMDNE6', '...",Natural Toothpaste - 2-1/2 oz.for dogs,6.83,{'Pet Supplies': 6424},http://ecx.images-amazon.com/images/I/41RnSg%2...,St. Jon,"[['Pet Supplies', 'Dogs', 'Health Supplies', '...","In this day and age, we're more concerned and ..."
2,2,9376674824,"{'also_bought': ['B000JMJ4CE', 'B003FVK47A', '...",Solid Gold S.E.P. (Stop Eating Poop) 3.5oz,10.77,{'Pet Supplies': 5295},http://ecx.images-amazon.com/images/I/41S0Ds8n...,Solid Gold,"[['Pet Supplies', 'Dogs', 'Health Supplies', '...",Stop Eating Poop contains Glutamic Acid to det...


In [4]:

# Missing-value audit for the fields that make up each sentence:
# title + brand + categories + description.
title_missing = df['title'].isnull()
desc_missing = df['description'].isnull()
brand_missing = df['brand'].isnull()
cates_missing = df['categories'].isnull()

# Rows without a title.
title_na_df = df[title_missing]
print(title_na_df.shape)

# Rows without a description.
desc_na_df = df[desc_missing]
print(desc_na_df.shape)

# Rows missing both description and title.
na_df = df[desc_missing & title_missing]
print(na_df.shape)

# Rows missing description, title and brand.
na3_df = df[desc_missing & title_missing & brand_missing]
print(na3_df.shape)

# Rows missing description, title, brand and categories.
na4_df = df[desc_missing & title_missing & brand_missing & cates_missing]
print(na4_df.shape)

(15, 10)
(1080, 10)
(15, 10)
(15, 10)
(0, 10)


In [5]:
# Replace NaN with a single plain space in every text column that feeds the
# sentences, so the string concatenation in the next cell never sees NaN.
for text_col in (desc_str, 'title', 'brand', 'categories'):
    df[text_col] = df[text_col].fillna(" ")


In [6]:
def _category_tokens(raw_categories):
    """Return the labels of the first category path stored in `raw_categories`.

    `raw_categories` is the text of the 'categories' column, e.g.
    "[['Pet Supplies', 'Cats', 'Toys']]". A blank value (the " " placeholder
    written by the fillna cell) or an empty list yields no labels.
    """
    if not isinstance(raw_categories, str) or not raw_categories.strip():
        return []
    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute arbitrary code the way eval() could.
    cates = ast.literal_eval(raw_categories)
    if isinstance(cates, list) and cates:
        return list(cates[0])
    return []


# One sentence per item: title + brand + first category path + description.
sentences = []
for title, brand, raw_cates, desc in zip(
        df['title'], df['brand'], df['categories'], df[desc_str]):
    # Title and brand, each followed by a space.
    sen = title + ' ' + brand + ' '
    # Each category label, followed by a space.
    for c in _category_tokens(raw_cates):
        sen = sen + c + ' '
    # Description last.
    sen += desc
    # Flatten embedded newlines so every item is a single line of text.
    sen = sen.replace('\n', ' ')

    sentences.append(sen)

sentences[:10]

["Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3   Pet Supplies Cats Toys This combo pack provides you with all 3 Cat Sitter DVD's. Keep your cats entertained morning, noon and night with our hilarious Go Cat Fun Pack DVDs and Catnip. Going out to dinner? Leaving for work? Keep Fluffy distracted and curious with footage of birds, butterflies, chipmunks, fish and more, complete with sound. Go Cat Combo Pack Cat Sitter DVD Set Features: Cat Sitter DVD Volume 1 Cat Sitter DVD Volume 2 Platinum Edition and the latest Cat Sitter Volume 3 - Gone Fishing",
 "Natural Toothpaste - 2-1/2 oz.for dogs St. Jon Pet Supplies Dogs Health Supplies Dental Care Toothpaste In this day and age, we're more concerned and aware of the chemicals and additives in the products we use for our dogs. This toothpaste contains only natural ingredients. No artificial preservatives or additives. Contains natural abrasives to safely remove plaque. For best results use 2 to 3 times a week. Ingredients - sorbitol, purifi

In [7]:

# Item IDs in the (sorted) row order of df.
course_list = df[i_id].tolist()

# The features are saved as plain arrays with one row per df row, so row i
# only lines up with itemID i if the IDs are exactly 0..n-1. Checking only the
# last ID would let duplicates or gaps slip through.
assert course_list == list(range(len(course_list))), \
    'itemID must be the contiguous sequence 0..n-1 after sorting'

In [8]:
# Requires the `sentence_transformers` package (`pip install sentence_transformers`);
# it is already installed in the MMRS environment.
from sentence_transformers import SentenceTransformer

# Text encoder. The original notebook used all-MiniLM-L6-v2 (fastest, but not
# the best quality); all-MiniLM-L12-v1 performs better.
model = SentenceTransformer('sentence-transformers/stsb-roberta-large')

# Encode every sentence into its embedding vector.
sentence_embeddings = model.encode(sentences)
print('text encoded!')

# There must be exactly one embedding per row of the metadata.
assert sentence_embeddings.shape[0] == df.shape[0]

# Save the embeddings as a NumPy file.
text_feat_path = os.path.join(file_path, 'text_feat.npy')
np.save(text_feat_path, sentence_embeddings)
print('done!')


  from tqdm.autonotebook import tqdm, trange


text encoded!
done!


In [9]:
# Preview the first ten rows of the embedding matrix.
sentence_embeddings[:10]

array([[ 0.1420442 , -0.01163902, -0.3023617 , ...,  0.6746849 ,
        -2.2539864 , -0.4122411 ],
       [-0.27704102, -0.40520045, -0.7750341 , ..., -0.12097083,
         0.58644056,  0.6641146 ],
       [-0.61060876, -0.97433203, -0.2441792 , ...,  0.88061196,
        -0.381086  , -0.41241136],
       ...,
       [-0.04272173, -0.81557673, -1.235492  , ...,  0.49767768,
        -0.26564866, -0.36658448],
       [-0.08430825,  0.25650015, -1.5931408 , ...,  0.75029767,
         0.18671128, -0.12336086],
       [-0.30899674, -0.12002388, -0.609143  , ..., -0.13782161,
        -0.5503508 ,  0.2284444 ]], dtype=float32)

In [10]:
# Reload the saved embeddings to confirm the file round-trips.
# The array is plain float32, so pickle support is not needed; leaving
# allow_pickle at its default keeps np.load from unpickling objects.
# The path is built the same way as in the cell that saved the file.
load_txt_feat = np.load(os.path.join(file_path, 'text_feat.npy'))
print(load_txt_feat.shape)
load_txt_feat[:10]

(8510, 1024)


array([[ 0.1420442 , -0.01163902, -0.3023617 , ...,  0.6746849 ,
        -2.2539864 , -0.4122411 ],
       [-0.27704102, -0.40520045, -0.7750341 , ..., -0.12097083,
         0.58644056,  0.6641146 ],
       [-0.61060876, -0.97433203, -0.2441792 , ...,  0.88061196,
        -0.381086  , -0.41241136],
       ...,
       [-0.04272173, -0.81557673, -1.235492  , ...,  0.49767768,
        -0.26564866, -0.36658448],
       [-0.08430825,  0.25650015, -1.5931408 , ...,  0.75029767,
         0.18671128, -0.12336086],
       [-0.30899674, -0.12002388, -0.609143  , ..., -0.13782161,
        -0.5503508 ,  0.2284444 ]], dtype=float32)

# Image encoder (V0), following LATTICE: items with no image feature get the average feature vector

In [11]:
# Show the first five metadata rows again; the image-feature cell below uses
# the 'asin' and 'itemID' columns.
df[:5]

Unnamed: 0,itemID,asin,related,title,price,salesRank,imUrl,brand,categories,description
0,0,1223000893,"{'also_bought': ['B000G1KNPK', 'B00BYHJ6UI', '...","Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",29.5,{'Pet Supplies': 39514},http://ecx.images-amazon.com/images/I/41yr4bsw...,,"[['Pet Supplies', 'Cats', 'Toys']]",This combo pack provides you with all 3 Cat Si...
1,1,4847676011,"{'also_bought': ['B006YFZXGG', 'B002KMDNE6', '...",Natural Toothpaste - 2-1/2 oz.for dogs,6.83,{'Pet Supplies': 6424},http://ecx.images-amazon.com/images/I/41RnSg%2...,St. Jon,"[['Pet Supplies', 'Dogs', 'Health Supplies', '...","In this day and age, we're more concerned and ..."
2,2,9376674824,"{'also_bought': ['B000JMJ4CE', 'B003FVK47A', '...",Solid Gold S.E.P. (Stop Eating Poop) 3.5oz,10.77,{'Pet Supplies': 5295},http://ecx.images-amazon.com/images/I/41S0Ds8n...,Solid Gold,"[['Pet Supplies', 'Dogs', 'Health Supplies', '...",Stop Eating Poop contains Glutamic Acid to det...
3,3,B00002N8FK,"{'also_bought': ['B000IGGFEQ', 'B000IGGH6W', '...",Heath Manufacturing S-1-8 Single Hanging Suet ...,1.77,"{'Patio, Lawn & Garden': 51680}",http://ecx.images-amazon.com/images/I/51JbAY9m...,Heath,"[['Pet Supplies', 'Birds', 'Feeding & Watering...",Squirrel resistant. Holds one Heath Suet Cake....
4,4,B00004X14K,"{'also_bought': ['B0002ASCQ2', 'B000QS5B6Y', '...",Bergan Stack-N-Stor 40 Stackable Storage,22.85,{'Pet Supplies': 3960},http://ecx.images-amazon.com/images/I/41AvjdJU...,Bergan,"[['Pet Supplies', 'Cats', 'Feeding & Watering ...",With Bergan's Stack-N-Store 100 Stackable Stor...


In [12]:
import array

def readImageFeatures(path, feat_dim=4096):
    """Yield ``(asin, features)`` pairs from a binary image-feature file.

    Each record is read as a 10-byte UTF-8 ASIN followed by ``feat_dim``
    values of array typecode ``'f'``.

    Args:
        path: Path of the binary feature file.
        feat_dim: Number of values stored per item (default 4096).

    Yields:
        Tuples ``(asin, features)`` where ``asin`` is a ``str`` and
        ``features`` is a list of ``feat_dim`` Python floats.

    Raises:
        EOFError: If the file ends in the middle of a feature vector
            (raised by ``array.fromfile``).
    """
    # The with-block closes the file once the generator is exhausted or closed;
    # the bare open() used before left the handle open.
    with open(path, 'rb') as f:
        while True:
            asin = f.read(10).decode('UTF-8')
            if asin == '':
                # Nothing left to read: end of file.
                break
            a = array.array('f')
            a.fromfile(f, feat_dim)
            yield asin, a.tolist()

In [13]:
# Stream (asin, feature vector) records from the binary image-feature file.
img_data = readImageFeatures("image_features_Pet_Supplies.b")
# Map each product's asin to its itemID.
item2id = dict(zip(df['asin'], df['itemID']))

# feats: itemID -> feature vector for every item found in the feature file.
# avg:   every matched vector, collected to compute the fallback average.
feats = {}
avg = []
for asin, vec in img_data:
    if asin in item2id:
        feats[int(item2id[asin])] = vec
        avg.append(vec)

# Without any match the mean below would be NaN and a meaningless feature
# file would be written silently, so stop with a clear error instead.
if not avg:
    raise ValueError(
        'no image features matched any asin in df; cannot compute the fallback average')
# Average feature vector, converted to a list.
avg = np.array(avg).mean(0).tolist()

# ret:    final feature rows, where row i belongs to itemID i.
# non_no: itemIDs that had no entry in the feature file.
ret = []
non_no = []
for i in range(len(item2id)):
    if i in feats:
        ret.append(feats[i])
    else:
        # Missing item: record its ID and fall back to the average vector.
        non_no.append(i)
        ret.append(avg)

# Report how many items were absent from the processed image features.
print('# of items not in processed image features:', len(non_no))
# One feature row per item.
assert len(ret) == len(item2id)
# Save the feature matrix.
np.save('image_feat.npy', np.array(ret))
# Save the IDs of the items that received the average vector.
np.savetxt("missed_img_itemIDs.csv", non_no, delimiter =",", fmt ='%d')
print('done!')

# of items not in processed image features: 74
done!
