# Preprocessing Amazon2023 for Item Context Information

In [1]:
from typing import List

import os
from freerec.data.tags import USER, ITEM, RATING, TIMESTAMP
from freerec.data.preprocessing.amazon2023 import extract_from_amazon2023

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset to preprocess; the name selects the input directory and the output file names.
dataset: str = "Baby"
# Key used to pick a URL out of each image entry.
image_size: str = 'large' # (thumb, large, vis)

# Directory holding the raw files for this dataset.
filedir: str = f"../data/Amazon2023/{dataset}"

# Output paths, written next to the raw data.
item_file = os.path.join(filedir, f"{dataset}.item")
inter_file = os.path.join(filedir, f"{dataset}.inter")

In [3]:
# Load the two tables produced by the project helper from the dataset directory:
# the first is bound to inter_df (interactions), the second to item_df (item metadata).
# Presumably both are pandas DataFrames -- the preview below relies on .head().
inter_df, item_df = extract_from_amazon2023(filedir)
# Preview the first five rows of each table (displayed together as a tuple).
inter_df.head(5), item_df.head(5)

(                           USER        ITEM  RATING      TIMESTAMP parent_asin
 0  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B004FM7VOW     4.0  1471546337000  B089MS68G8
 1  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B01E5E703G     5.0  1471542244000  B01E5E703G
 2  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B00F463XV8     1.0  1452650881000  B00F9386Q8
 3  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  B0007V644S     5.0  1408994051000  B07RRDX26B
 4  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  B002LARFLY     5.0  1349818961000  B00OLRJET6,
          ITEM parent_asin                                              title  \
 0  B004FM7VOW  B089MS68G8  Wildkin Original Nap Mat with Reusable Pillow ...   
 1  B01E5E703G  B01E5E703G  Swim Diaper, Angel Love 2Pcs Pack Big One Size...   
 2  B00F463XV8  B00F9386Q8                  Mud Pie Candy Monster Bags, Black   
 3  B0007V644S  B07RRDX26B                 Sassy 250ct Diaper Sacks- Lavender   
 4  B002LARFLY  B00OLRJET6  Spasilk Bath Hooded Towels & Washcloths Set fo...   
 
                               

In [4]:

# text converter that merges texts for each field
def text_converter(texts):
    """Flatten one metadata field value into a single stripped string.

    Parameters
    ----------
    texts : str | list | dict | Any
        - ``str``: returned with surrounding whitespace removed.
        - ``list``: entries are stripped and joined with ``'; '``.
        - ``dict``: rendered as ``'key: value'`` pairs joined with ``'; '``.
        - anything else (e.g. ``None`` or NaN): treated as missing.

    Returns
    -------
    str
        The merged text; an empty string for missing or unsupported input.

    Notes
    -----
    List entries and dict values are not guaranteed to be strings, so they
    are coerced with ``str()`` before stripping instead of raising
    ``AttributeError``. ``None`` entries are skipped.
    """
    def _clean(value) -> str:
        # Coerce first: raw metadata may hold numbers or other non-string values.
        return str(value).strip()

    if isinstance(texts, str):
        merged = texts
    elif isinstance(texts, list):
        merged = '; '.join(_clean(text) for text in texts if text is not None)
    elif isinstance(texts, dict):
        merged = '; '.join(
            f"{key}: {_clean(text)}"
            for key, text in texts.items()
            if text is not None
        )
    else:
        # Missing values (None / NaN) and unsupported types become empty text.
        merged = ''
    return merged.strip()

# image url converter that selects one of image urls
def image_url_converter(image_urls: List, image_size: str = image_size):
    """Pick the URL of the requested size from the first image entry.

    Parameters
    ----------
    image_urls : List
        Image entries for one item; the first entry is indexed by
        ``image_size``. Missing values (``None`` / NaN) are tolerated.
    image_size : str
        Key to look up in the first entry. The default is the value of the
        notebook-level ``image_size`` at the time this function is defined.

    Returns
    -------
    str
        The selected URL, or ``''`` when there are no images, the first
        entry lacks the requested size, or the stored value is not a string.
    """
    # Missing cells have no length; treat them like an empty list.
    if not hasattr(image_urls, '__len__') or len(image_urls) == 0:
        return ''
    try:
        url = image_urls[0][image_size]
    except (KeyError, TypeError, IndexError):
        # First entry does not provide the requested size (or is not indexable by it).
        return ''
    # A null URL would otherwise leak into the column as a non-string value.
    return url if isinstance(url, str) else ''

In [5]:
# Flatten every selected metadata column to plain text, then concatenate them
# into a single 'text' column shaped like "Title: <...>. Brand: <...>. Categories: <...>."
# (an alternative field set is ('title', 'features', 'description'))
text_fields = ('title', 'brand', 'categories')
for field in text_fields:
    item_df[field] = item_df[field].map(text_converter)

# One "<Field>: <value>." fragment per field, separated by single spaces.
item_df['text'] = item_df.apply(
    lambda record: " ".join(
        f"{name.title()}: {record[name]}." for name in text_fields
    ),
    axis=1,
)

In [6]:
# convert image urls
# Apply image_url_converter to each item's 'image_urls' value and store the
# result in a new 'image_url' column.
item_df['image_url'] = item_df['image_urls'].map(image_url_converter)

In [7]:
# Keep only the columns that are written to disk:
#   .inter -> USER, ITEM, RATING, TIMESTAMP
#   .item  -> ITEM, text, image URL
inter_df = inter_df.loc[:, [USER.name, ITEM.name, RATING.name, TIMESTAMP.name]]
item_df = item_df.loc[:, [ITEM.name, 'text', 'image_url']]
# Preview both trimmed tables.
inter_df.head(5), item_df.head(5)

(                           USER        ITEM  RATING      TIMESTAMP
 0  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B004FM7VOW     4.0  1471546337000
 1  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B01E5E703G     5.0  1471542244000
 2  AGKASBHYZPGTEPO6LWZPVJWB2BVA  B00F463XV8     1.0  1452650881000
 3  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  B0007V644S     5.0  1408994051000
 4  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  B002LARFLY     5.0  1349818961000,
          ITEM                                               text  \
 0  B004FM7VOW  Title: Wildkin Original Nap Mat with Reusable ...   
 1  B01E5E703G  Title: Swim Diaper, Angel Love 2Pcs Pack Big O...   
 2  B00F463XV8  Title: Mud Pie Candy Monster Bags, Black.\nBra...   
 3  B0007V644S  Title: Sassy 250ct Diaper Sacks- Lavender.\nBr...   
 4  B002LARFLY  Title: Spasilk Bath Hooded Towels & Washcloths...   
 
                                            image_url  
 0  https://m.media-amazon.com/images/I/41kuAI0+-x...  
 1  https://m.media-amazon.com/images/I/61KzMipUA2...  
 2  htt

In [8]:
# Save
# Write both tables as tab-separated files, without the index column, to the
# paths built from the dataset name (inter_file / item_file).
inter_df.to_csv(inter_file, sep='\t', index=False)
item_df.to_csv(item_file, sep='\t', index=False)

In [9]:
# Sanity check: show the combined text of the row labelled 0.
# NOTE(review): the saved output of this cell shows newline-separated fields and
# a non-Baby item, while the 'text' column is built with a space separator --
# it looks like it comes from an earlier run; re-run to refresh.
item_df['text'][0]

'Title: Terra Tattoos Tropical Hawaiian Metallic Tattoos - 75 Gold Silver Temporary Tattoos Turtles, Dolphins, Stars, Sun, Moon, Starfish, Seahorse, Coral, Palm Trees, Hibiscuses, Puka Shells & more!.\nBrand: Terra Tattoos.\nCategories: Beauty & Personal Care; Makeup; Body; Temporary Tattoos.'

After preprocessing the dataset, please use the following command to prepare the filtered and split dataset:

```bash
freerec make Amazon2023{dataset} --root ../data --filedir Amazon2023/{dataset} -ku 10 -ki 10 -sp 4 --ratios 8,1,1 --splitting ROU
```