# Sports14 Text/Image Feature Extraction

In [1]:

import ast
import os

import numpy as np
import pandas as pd

In [None]:
# Point the kernel at the folder holding the dataset files; every relative
# path used afterwards is resolved against this directory.
dataset_dir = 'your dataset folder path'
os.chdir(dataset_dir)
os.getcwd()

## Load text data

In [3]:
# Column names reused by later cells.
i_id = 'itemID'
desc_str = 'description'

# Location of the item metadata CSV.
file_path = './'
file_name = 'meta-sports.csv'
meta_file = os.path.join(file_path, file_name)

# Read the metadata and order the rows by item id.
df = pd.read_csv(meta_file).sort_values(by=[i_id])

print('data loaded!')
print(f'shape: {df.shape}')

df.head(3)

data loaded!
shape: (18357, 10)


Unnamed: 0,itemID,asin,title,price,imUrl,related,brand,categories,salesRank,description
0,0,1881509818,Ghost Inc Glock Armorers Tool 3/32 Punch,9.99,http://ecx.images-amazon.com/images/I/21iMxsyD...,"{'also_bought': ['B000U3YWEM', 'B000U401J6', '...",Ghost,"[['Sports & Outdoors', 'Hunting & Fishing', 'H...",{'Sports &amp; Outdoors': 172909},Ghost Armorer Tool (1). The GAT is made with a...
1,1,2094869245,5 LED Bicycle Rear Tail Red Bike Torch Laser B...,8.26,http://ecx.images-amazon.com/images/I/51RtwnJw...,"{'also_bought': ['B0081O93N2', 'B00EYTCHJA', '...",,"[['Sports & Outdoors', 'Cycling', 'Lights & Re...",{'Sports &amp; Outdoors': 14293},This newly-designed Laser tail light can emit ...
2,2,7245456259,Black Mountain Products Single Resistance Band...,10.49,http://ecx.images-amazon.com/images/I/411Ikpf1...,"{'also_bought': ['B00DDBS2JE', 'B00H1KNHE8', '...",Black Mountain,"[['Sports & Outdoors', 'Exercise & Fitness', '...",{'Sports &amp; Outdoors': 1010},Black Mountain Products single resistance band...


In [4]:

# Each item's sentence is built from title + brand + categories + description,
# so count how many rows lack these fields, alone and in combination.
title_missing = df['title'].isnull()
desc_missing = df['description'].isnull()
brand_missing = df['brand'].isnull()
cates_missing = df['categories'].isnull()

title_na_df = df[title_missing]
desc_na_df = df[desc_missing]
na_df = df[desc_missing & title_missing]
na3_df = df[desc_missing & title_missing & brand_missing]
na4_df = df[desc_missing & title_missing & brand_missing & cates_missing]

# Report the shapes in the order: title, description, both, +brand, +categories.
for subset in (title_na_df, desc_na_df, na_df, na3_df, na4_df):
    print(subset.shape)

(91, 10)
(2659, 10)
(40, 10)
(40, 10)
(0, 10)


In [5]:

# Replace missing text fields with a single space so that the string
# concatenation in the sentence-building step never sees NaN.
for text_col in (desc_str, 'title', 'brand', 'categories'):
    df[text_col] = df[text_col].fillna(" ")


In [6]:
# Build one text document per item: title, brand, the entries of the first
# category path and the description, separated by single spaces.
sentences = []
for _idx, row in df.iterrows():
    sen = row['title'] + ' ' + row['brand'] + ' '

    # `categories` stores the string repr of a nested list. Missing values were
    # replaced by a blank string earlier, which is not parseable, so skip those.
    # ast.literal_eval accepts only Python literals, whereas eval() would run
    # any expression embedded in the CSV.
    raw_cates = row['categories']
    cates = ast.literal_eval(raw_cates) if raw_cates.strip() else None

    # Guard against an empty list as well, since cates[0] would raise IndexError.
    if isinstance(cates, list) and cates:
        for c in cates[0]:
            sen = sen + c + ' '

    sen += row[desc_str]
    # Flatten embedded newlines so each item is a single-line sentence.
    sen = sen.replace('\n', ' ')

    sentences.append(sen)

sentences[:10]

['Ghost Inc Glock Armorers Tool 3/32 Punch Ghost Sports & Outdoors Hunting & Fishing Hunting Gun Maintenance Gunsmithing Tools Ghost Armorer Tool (1). The GAT is made with a spring steel punch. The diameter is 3/32 of an inch or 2.5mm, this is the same as the OEM tool size. The difference is you will be able to press harder without bending the shaft of this punch. Just a better tool to work on your Glock with.',
 '5 LED Bicycle Rear Tail Red Bike Torch Laser Beam Lamp Light   Sports & Outdoors Cycling Lights & Reflectors Taillights This newly-designed Laser tail light can emit two parallel lines, to form a virtual lane together with the moving of bicycle on the road. LED flash light and  two lines not only enhance the waring effect strongly and greatly but also improve the safety of night riding.',
 'Black Mountain Products Single Resistance Band - Door Anchor and Starter Guide Included Black Mountain Sports & Outdoors Exercise & Fitness Accessories Exercise Bands Black Mountain Produc

In [7]:

# Item ids in the (sorted) row order of `df`.
course_list = df[i_id].tolist()

# Sanity check: the last id must equal the number of items minus one.
expected_last_id = len(course_list) - 1
assert course_list[-1] == expected_last_id

In [8]:
# Requires the `sentence_transformers` package
# (run `pip install sentence_transformers` beforehand).
from sentence_transformers import SentenceTransformer

# Encode every item sentence with the all-MiniLM-L6-v2 model.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings = model.encode(sentences)
print('text encoded!')

# One embedding row per metadata row, then persist next to the metadata file.
n_items = df.shape[0]
assert sentence_embeddings.shape[0] == n_items
text_feat_path = os.path.join(file_path, 'text_feat.npy')
np.save(text_feat_path, sentence_embeddings)
print('done!')


text encoded!
done!


In [9]:
# Display the first ten embedding rows as a quick visual sanity check.
sentence_embeddings[:10]

array([[-0.12623426,  0.03341388, -0.01948772, ..., -0.1013338 ,
         0.0514545 ,  0.07334712],
       [ 0.0068029 ,  0.00055715, -0.03157376, ...,  0.03421347,
         0.02450724,  0.03113373],
       [-0.12395922,  0.05546276, -0.00272348, ..., -0.19819073,
         0.04171506,  0.05105354],
       ...,
       [-0.06516663,  0.04306812, -0.00357155, ...,  0.02348825,
        -0.02514204,  0.06650119],
       [ 0.05071206,  0.03823141, -0.04340539, ...,  0.00951272,
         0.05093095,  0.03292951],
       [-0.13305898,  0.07934257, -0.01714416, ..., -0.11284354,
        -0.00523037,  0.03694083]], dtype=float32)

In [10]:
# Reload the saved text features to confirm the round trip.
# The file is read from the same location it was written to (under
# `file_path`). `allow_pickle` is left at its safe default because the saved
# array is plain numeric data, and unpickling can execute arbitrary code.
load_txt_feat = np.load(os.path.join(file_path, 'text_feat.npy'))
print(load_txt_feat.shape)
load_txt_feat[:10]

(18357, 384)


array([[-0.12623426,  0.03341388, -0.01948772, ..., -0.1013338 ,
         0.0514545 ,  0.07334712],
       [ 0.0068029 ,  0.00055715, -0.03157376, ...,  0.03421347,
         0.02450724,  0.03113373],
       [-0.12395922,  0.05546276, -0.00272348, ..., -0.19819073,
         0.04171506,  0.05105354],
       ...,
       [-0.06516663,  0.04306812, -0.00357155, ...,  0.02348825,
        -0.02514204,  0.06650119],
       [ 0.05071206,  0.03823141, -0.04340539, ...,  0.00951272,
         0.05093095,  0.03292951],
       [-0.13305898,  0.07934257, -0.01714416, ..., -0.11284354,
        -0.00523037,  0.03694083]], dtype=float32)

# Image encoder (V0), following LATTICE: items with missing image features receive the average feature vector

In [11]:
# Show the first five metadata rows (the `asin` column is used below to match image features).
df.head(5)

Unnamed: 0,itemID,asin,title,price,imUrl,related,brand,categories,salesRank,description
0,0,1881509818,Ghost Inc Glock Armorers Tool 3/32 Punch,9.99,http://ecx.images-amazon.com/images/I/21iMxsyD...,"{'also_bought': ['B000U3YWEM', 'B000U401J6', '...",Ghost,"[['Sports & Outdoors', 'Hunting & Fishing', 'H...",{'Sports &amp; Outdoors': 172909},Ghost Armorer Tool (1). The GAT is made with a...
1,1,2094869245,5 LED Bicycle Rear Tail Red Bike Torch Laser B...,8.26,http://ecx.images-amazon.com/images/I/51RtwnJw...,"{'also_bought': ['B0081O93N2', 'B00EYTCHJA', '...",,"[['Sports & Outdoors', 'Cycling', 'Lights & Re...",{'Sports &amp; Outdoors': 14293},This newly-designed Laser tail light can emit ...
2,2,7245456259,Black Mountain Products Single Resistance Band...,10.49,http://ecx.images-amazon.com/images/I/411Ikpf1...,"{'also_bought': ['B00DDBS2JE', 'B00H1KNHE8', '...",Black Mountain,"[['Sports & Outdoors', 'Exercise & Fitness', '...",{'Sports &amp; Outdoors': 1010},Black Mountain Products single resistance band...
3,3,7245456313,Black Mountain Products Resistance Band Set wi...,32.99,http://ecx.images-amazon.com/images/I/51FdHlZS...,"{'also_bought': ['1612431712', 'B00GSBMW2Y', '...",Black Mountain,"[['Sports & Outdoors', 'Exercise & Fitness', '...",{'Sports &amp; Outdoors': 15},[if gte mso 9]><xml> <o:OfficeDocumentSettings...
4,4,B000002NUS,Outers Universal 32-Piece Blow Molded Gun Clea...,21.99,http://ecx.images-amazon.com/images/I/510GjWgd...,"{'also_bought': ['B000PW64JY', 'B0010KHNEU', '...",Outers,"[['Sports & Outdoors', 'Hunting & Fishing', 'H...",{'Sports &amp; Outdoors': 26738},Outers now offers this rigid and durable hard ...


In [12]:
import array

def readImageFeatures(path, feat_dim=4096):
    """Lazily read (asin, feature) records from a binary image-feature file.

    Each record consists of a 10-byte ASIN followed by `feat_dim` values of
    array type code 'f' (C float).

    Args:
        path: Path of the binary feature file.
        feat_dim: Number of float values stored per record (default 4096).

    Yields:
        Tuples `(asin, features)` where `asin` is the decoded 10-character id
        and `features` is a list of `feat_dim` floats.

    Raises:
        EOFError: If the file ends in the middle of a feature vector.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed; previously the handle was never released.
    with open(path, 'rb') as f:
        while True:
            asin = f.read(10).decode('UTF-8')
            if asin == '':
                # Clean end of file: no further records.
                break
            a = array.array('f')
            a.fromfile(f, feat_dim)
            yield asin, a.tolist()

In [13]:

# Stream the raw image features and map each ASIN in the metadata to its item id.
img_data = readImageFeatures("image_features_Sports_and_Outdoors.b")
item2id = dict(zip(df['asin'], df['itemID']))

# Keep only features whose ASIN appears in the metadata, keyed by item id,
# and collect the same vectors for computing the mean feature.
feats = {}
avg = []
for asin, feat in img_data:
    if asin not in item2id:
        continue
    feats[int(item2id[asin])] = feat
    avg.append(feat)
avg = np.array(avg).mean(0).tolist()

# Assemble the feature matrix in item-id order; items without an image
# feature fall back to the mean vector and are recorded in `non_no`.
n_items = len(item2id)
non_no = [item_id for item_id in range(n_items) if item_id not in feats]
ret = [feats.get(item_id, avg) for item_id in range(n_items)]

print('# of items not in processed image features:', len(non_no))
assert len(ret) == n_items
np.save('image_feat.npy', np.array(ret))
np.savetxt("missed_img_itemIDs.csv", non_no, delimiter =",", fmt ='%d')
print('done!')

# of items not in processed image features: 180
done!
